diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..617a625 --- /dev/null +++ b/.gitignore @@ -0,0 +1,184 @@ + +# Created by https://www.toptal.com/developers/gitignore/api/python,vim,visualstudiocode +# Edit at https://www.toptal.com/developers/gitignore?templates=python,vim,visualstudiocode + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +### Vim ### +# Swap +[._]*.s[a-v][a-z] +!*.svg # comment out if you don't need vector files +[._]*.sw[a-p] +[._]s[a-rt-v][a-z] +[._]ss[a-gi-z] +[._]sw[a-p] + +# Session +Session.vim +Sessionx.vim + +# Temporary +.netrwhist +*~ +# Auto-generated tag files +tags +# Persistent undo +[._]*.un~ + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +*.code-workspace + +# Local History for Visual Studio Code +.history/ + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +# End of https://www.toptal.com/developers/gitignore/api/python,vim,visualstudiocode + +.DS_Store +*~ \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..f2cf558 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,72 @@ +--- +# This file is a template, and might need editing before it works on your project. 
+# To contribute improvements to CI/CD templates, please follow the Development guide at: +# https://docs.gitlab.com/ee/development/cicd/templates.html +# This specific template is located at: +# https://gitlab.com/gitlab-org/gitlab/-/blob/master/lib/gitlab/ci/templates/Python.gitlab-ci.yml + +# Note that the Gitlab Runner machine is configured to use MITRE repo +image: python:3 + +# Change pip's cache directory to be inside the project directory since we can +# only cache local items. +variables: + PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip" + +# Pip's cache doesn't store the python packages +# https://pip.pypa.io/en/stable/reference/pip_install/#caching +# +# If you want to also cache the installed packages, you have to install +# them in a virtualenv and cache it as well. +cache: + paths: + - .cache/pip + - venv/ + +before_script: + - python -V + - python -m venv venv + - source venv/bin/activate + - pip install -r tools/requirements.txt + +lint yaml: + stage: test + script: + - pip install -r tests/requirements.txt + - yamllint -c tests/.yamllint . + rules: + - changes: + - "*.yaml" + - "*.yml" + +validate data: + stage: test + script: + - pip install -r tests/requirements.txt + # Run tests with minimal console output, produce report, and fail on warnings + - pytest --tb=line --junitxml=report.xml -W error::UserWarning + - yamllint -c tests/.yamllint . + artifacts: + when: always + reports: + junit: report.xml + rules: + - changes: + - data/*.yaml # Source data was updated + - tests/*.py # Any tests changed + - conftest.py # Any test fixtures changed + +# Checks that a generated ATLAS.yaml matches the one commited to this project. +# Fails if they are different, only runs on merge requests or protected branches +check ATLAS.yaml up-to-date: + stage: test + script: + - python tools/create_matrix.py + - git diff --exit-code dist/ATLAS.yaml || exit_code=$? + - if [[ $exit_code -ne 0 ]]; then echo 'Runner-generated dist/ATLAS.yaml is different from remote repository version - run tools/create_matrix.py to update and commit the result.'; exit 123; fi; + rules: + # Default branch, main, tags, and all types of merge request pipelines. 
+ - if: $CI_MERGE_REQUEST_IID + - if: $CI_COMMIT_TAG + - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH + - if: '$CI_COMMIT_BRANCH == "main"' diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..11def8d --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,94 @@ +# ATLAS Data Changelog + +## [3.0.0]() (2022-03-23) + +Move to new GitHub repository under the `mitre-atlas` group + +#### Distributed files +- Renamed case study JSON schema file and updated to include `study` key expected by the ATLAS website +- Added README.md with usage + +#### Case studies +- Minor title updates + +## [2.4.0]() (2022-03-10) + +Repository re-org and cleanup, added READMEs to all directories + +#### Distributed files +- Moved `ATLAS.yaml` into a new `dist` directory +- Added JSON Schema files for `ATLAS.yaml` and case study files as created by the ATLAS website to `dist/schemas` directory + +#### Schemas +- Moved schemas from test fixtures into their own directory + +#### Tools +- Moved Navigator scripts to a separate repository +- Added case study file import script +- Added JSON Schema generation script + +## [2.3.1]() (2022-02-07) + +#### Tools +- ATLAS YAML generation script uses Jinja template evaluation and handles relative `!include` filepaths + +## [2.3.0]() (2022-01-24) + +#### Tactics and techniques +- Adapted referenced ATT&CK tactics into the ATLAS framework + + Updated descriptions to be machine learning-specific + + Changed IDs to ATLAS IDs +- Added ATLAS techniques used in new case studies, adapted from ATT&CK with updated ATLAS IDs and descriptions + + Data from Information Repositories + + Establish Accounts + + Valid Accounts + +#### Case studies +- Added key `incident-date-granularity` to case study files with values `DATE`, `MONTH`, or `YEAR` indicating the specificity of the `incident-date` + +## [2.2.1]() (2021-12-08) + +Fixes to all data + +#### Tests +- Added pytest suite for data validation and syntax checks + +## [2.2.0]() (2021-10-29) + +#### Case studies +- Added new case studies + 1. AML.CS0013 + 2. AML.CS0014 + +#### Tools +- Removed retrieval and usage of ATT&CK Enterprise data + +## [2.1.0]() (2021-08-31) + +`advmlthreatmatrix` renamed to `ATLAS` + +- Scripts updated accordingly +- Fixes to all data + +## [2.0.1]() (2021-06-11) + +Fixes to all data + +#### Tools +- Added data validation script + +## [2.0.0]() (2021-05-13) + +#### Distributed files +- Added `ATLAS.yaml` file with all tactics, techniques, and case studies + +#### Tactics and techniques +- Removed hardcoded IDs in favor of YAML anchors and template syntax + +#### Tools +- Added `ATLAS.yaml` generation script +- Added ATT&CK Enterprise v9 STIX retrieval and conversion script + +## [1.0.0]() (2021-02-17) + +Initial data definition diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..3f83e3e --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,35 @@ +# Contributing to ATLAS Data + +Contributions are welcome - feel free to use the issues or make pull requests to the `develop` branch for general questions and fixes. + +To propose additions or significant changes to the ATLAS framework, please email [atlas@mitre.org](mailto:atlas@mitre.org). + +To help construct case study submissions, please use the [case study builder](https://atlas.mitre.org/studies/create). 
+ +## Developer's Certificate of Origin 1.1 + +``` +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. +``` diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6d15b31 --- /dev/null +++ b/LICENSE @@ -0,0 +1,13 @@ +Copyright 2021-2022 MITRE + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000..18878c9 --- /dev/null +++ b/README.md @@ -0,0 +1,128 @@ +# MITRE | ATLAS Data + +ATLAS enables researchers to navigate the landscape of threats to artificial intelligence and machine learning systems. Visit https://atlas.mitre.org for more information. + +This repository contains the tactics, techniques, and case studies data used by the ATLAS website and associated tools. + +## Distributed files + +Located the `dist` directory: + +- `ATLAS.yaml` + + All ATLAS-related data available in one file + + See the schemas and usage below for more details. Top-level keys include: + ```yaml + id: ATLAS + name: ATLAS Machine Learning Threat Matrix + version: Version number for this data release + tactics: List of tactics objects + techniques: List of technique and subtechnique objects + case-studies: List of case study objects + ``` +- `schemas/` + + Optional JSON Schema files for validation use + + `atlas_matrix_schema.json` + * Describes the `ATLAS.yaml` format + + `atlas_website_case_study_schema.json` + * Describes the case study file format + +### Getting the files + +Clone this repository to get access to the distributed files, or alternatively directly access via raw GitHub link. + +#### As a Git submodule + +The [ATLAS Website](https://github.com/mitre-atlas/atlas-website) uses this data repository as a Git submodule for access to the distributed files. + +To add this repository as a submodule to your own repository, run the following which clones into the directory `atlas-data`. + +```bash +git submodule add -b main +``` + +Once the submodule is available, run the following once to sparse checkout only the necessary files in the `dist` directory. 
Assumes that the submodule is available at the path `atlas-data`. +```bash +git -C atlas-data config core.sparseCheckout true +echo 'dist/*' >> .git/modules/atlas-data/info/sparse-checkout +git submodule update --force --checkout atlas-data +``` + +To update `atlas-data`, run `git submodule update --remote` to get the latest from its main branch, then commit the result. + +### Example usage + +The following code blocks show examples of parsing ATLAS data. Assume `atlas_data_filepath` holds the path to the `ATLAS.yaml` file. + +#### Python +```python +# pip install pyyaml +import yaml + +with open(atlas_data_filepath) as f: + # Parse YAML + data = yaml.safe_load(f) + + tactics = data['tactics'] + techniques = data['techniques'] + studies = data['case-studies'] +``` + +#### NodeJS +```js +const fs = require('fs') +// npm install js-yaml +const yaml = require('js-yaml') + +fs.readFile(atlas_data_filepath, 'utf-8', (_, contents) => { + // Parse YAML + const data = yaml.load(contents) + + const tactics = data['tactics'] + const techniques = data['techniques'] + const studies = data['case-studies'] +}) +``` + +### JSON Schema validation example + +JSON Schema files are generated from this project's internal [schemas](schemas/README.md) for other tools to use. For example, the ATLAS website validates uploaded case study files against the case study schema file with the following: + +#### NodeJS + +```js +// npm install jsonschema +import { validate } from 'jsonschema' +import caseStudySchema from '' + +// Assume this is a populated website case study object +const caseStudyObj = {...} + +// Validate case study object against schema and emit errors that may occur from nested `anyOf` validations +const validatorResult = validate(caseStudyObj, caseStudySchema, { nestedErrors: true }) + +if (validatorResult.valid) { + // Good +} else { + // Process validatorResult.errors +} + +``` + +## Development + +This repository also contains the source data and scripts to customize and expand the ATLAS framework. See [setup instructions](tools/README.md#development-setup) and the READMEs in each directory linked below for usage. + +- [Data](data/README.md) holds templated data for ATLAS tactics, techniques, and case studies, from which `ATLAS.yaml` is generated. +- [Schemas](schemas/README.md) defines each ATLAS object type and ID. +- [Tools](tools/README.md) contains scripts to generate the distributed files and import data files. + +**Tests** + +This project uses `pytest` for data validation. See [tests](tests/README.md) for more information. + + +## Related work + +ATLAS is modeled after the [MITRE ATT&CK® framework](https://attack.mitre.org). ATLAS tactics and techniques can be complementary to those in ATT&CK. + +ATLAS data is also available in [STIX and ATT&CK Navigator layer formats](https://github.mitre.org/mitre-atlas/atlas-navigator-data) for use with the [ATLAS Navigator](https://mitre-atlas.github.io/atlas-navigator/). diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000..60be81e --- /dev/null +++ b/conftest.py @@ -0,0 +1,145 @@ +import datetime + +import pytest +from schema import Or, Optional, Regex, Schema + +from schemas import atlas_matrix, atlas_obj +from tools.create_matrix import load_atlas_data + +""" +Defines global pytest fixtures for ATLAS data and schemas. + +This file is in the top-level of the repo for access to tools and schemas. 
+ +https://docs.pytest.org/en/6.2.x/fixture.html#conftest-py-sharing-fixtures-across-multiple-files +""" + +#region Parameterized fixtures +@pytest.fixture(scope='session') +def matrix(request): + """Represents the ATLAS matrix (ATLAS.yaml) dictionary.""" + return request.param + +@pytest.fixture(scope='session') +def tactics(request): + """Represents each tactic dictionary.""" + return request.param + +@pytest.fixture(scope='session') +def techniques(request): + """Represents each technique dictionary""" + return request.param + +@pytest.fixture(scope='session') +def case_studies(request): + """Represents each case study dictionary.""" + return request.param + +@pytest.fixture(scope='session') +def text_with_possible_markdown_syntax(request): + """Represents the descriptions field of tactics, techniques, and case study procedure steps, + which can have Markdown links and syntax. + """ + return request.param + +@pytest.fixture(scope='session') +def text_to_be_spellchecked(request): + """Represents the text fields that can be spellchecked, including: + - tactic and technique names and descriptions + - case study names and summaries, procedure step descriptions + """ + return request.param +#endregion + +def pytest_generate_tests(metafunc): + """Enables test functions that use the above fixtures to operate on a + single dictionary, where each test function is automatically run once + for each dictionary in the tactics/techniques/case studies lists. + + Loads in the ATLAS data and sets up the pytest scheme to yield one + dictionary for each above fixture, as well as other test fixtures. + + https://docs.pytest.org/en/stable/parametrize.html#basic-pytest-generate-tests-example + """ + # Read the YAML files in this repository and create the nested dictionary + path_to_matrix_file = 'data/matrix.yaml' + data = load_atlas_data(path_to_matrix_file) + + # Parametrize when called for via test signature + if 'matrix' in metafunc.fixturenames: + # Only one arg, wrap in list + metafunc.parametrize('matrix', [data], indirect=True, scope='session') + + ## Create parameterized fixtures for tactics, techniques, and case studies for schema validation + + # These are the top-level keys of that dictionary + # and also the names of the fixtures we'd like to generate. 
+ # Note the underscore instead of the dash + keys = ['tactics', 'techniques', 'case_studies'] + + for key in keys: + # Parametrize when called for via test signature + if key in metafunc.fixturenames: + # Handle the key 'case_studies' really being 'case-studies' in the input + values = data[key.replace('_','-')] + # Parametrize each object, using the ID as identifier + metafunc.parametrize(key, values, ids=lambda x: x['id'], indirect=True, scope='session') + + ## Create parameterized fixtures for Markdown link syntax verification - technique descriptions and case study procedure steps + + # Parameter format is (test_identifier, text) + text_with_possible_markdown_syntax = [(f"{t['id']} Description", t['description']) for t in data['techniques']] + for cs in data['case-studies']: + # Identify in test with case study ID + P#{1-based index of procedure step} + text_with_possible_markdown_syntax.extend([(f"{cs['id']} Procedure #{i+1}", p['description']) for i, p in enumerate(cs['procedure'])]) + # Parametrize when called for via test signature + if 'text_with_possible_markdown_syntax' in metafunc.fixturenames: + metafunc.parametrize('text_with_possible_markdown_syntax', text_with_possible_markdown_syntax, ids=lambda x: x[0], indirect=True, scope='session') + + ## Create parameterized fixtures for text to be spell-checked - names, descriptions, summary + # Parameter format is (text_identifier, text) + + # Start with existing descriptions from technique descriptions and case study procedure steps + text_to_be_spellchecked = text_with_possible_markdown_syntax + # Tactic text + for t in data['tactics']: + text_to_be_spellchecked.append((f"{t['id']} Name", t['name'])) + text_to_be_spellchecked.append((f"{t['id']} Description", t['description'])) + # Already contains technique descriptions, add names + text_to_be_spellchecked.extend([(f"{t['id']} Name", t['name']) for t in data['techniques']]) + # Case study text + for cs in data['case-studies']: + text_to_be_spellchecked.append((f"{cs['id']} Name", cs['name'])) + text_to_be_spellchecked.append((f"{cs['id']} Summary", cs['summary'])) + + # Parametrize when called for via test signature + if 'text_to_be_spellchecked' in metafunc.fixturenames: + metafunc.parametrize('text_to_be_spellchecked', text_to_be_spellchecked, ids=lambda x: x[0], indirect=True, scope='session') + + +#region Schemas +@pytest.fixture(scope='session') +def matrix_schema(): + """Defines the schema and validation for the ATLAS matrix.""" + return atlas_matrix.atlas_matrix_schema + +@pytest.fixture(scope='session') +def tactic_schema(): + """Defines the schema and validation for the tactic object.""" + return atlas_obj.tactic_schema + +@pytest.fixture(scope='session') +def technique_schema(): + """Defines the schema and validation for a top-level technique object.""" + return atlas_obj.technique_schema + +@pytest.fixture(scope='session') +def subtechnique_schema(): + """Defines the schema and validation for a subtechnique object.""" + return atlas_obj.subtechnique_schema + +@pytest.fixture(scope='session') +def case_study_schema(): + """Defines the schema and validation for a case study object.""" + return atlas_obj.case_study_schema +#endregion \ No newline at end of file diff --git a/data/README.md b/data/README.md new file mode 100644 index 0000000..85ac959 --- /dev/null +++ b/data/README.md @@ -0,0 +1,76 @@ +# Data + +ATLAS data is stored in YAML files designed to be easy to read and edit, as well as to load, parse, and validate. 
+
+- `matrix.yaml` contains metadata, tactics in matrix order, and includes the other data files.
+
+- `tactics.yaml` contains ATLAS tactics, which represent adversary goals.
+
+- `techniques.yaml` contains ATLAS techniques and subtechniques, which represent the means by which adversaries achieve tactical goals.
+
+- `case-studies/` is a directory containing ATLAS case study files, which describe select machine learning attack incidents and how they map to the ATLAS framework.
+
+## Anchors and templates
+
+Each tactic and technique object has a YAML anchor, which is prefaced with `&`.
+
+```yaml
+- &supply_chain
+  id: AML.T0010
+  name: ML Supply Chain Compromise
+  object-type: technique
+```
+
+Anchors are used as variable names throughout the files in template expressions, wrapped with `{{ }}`.
+
+```jinja
+This data may be introduced to a victim system via [{{supply_chain.name}}](/techniques/{{supply_chain.id}}).
+```
+
+When using `tools/create_matrix.py` to generate the fully-populated `ATLAS.yaml` data file, these source files are evaluated as templates. The output of evaluating the example above:
+
+```md
+This data may be introduced to a victim system via [ML Supply Chain Compromise](/techniques/AML.T0010)
+```
+
+## Updating the data
+
+### Tactics and techniques
+
+Modify `tactics.yaml` and `techniques.yaml` for changes to the ATLAS framework itself.
+
+Ensure that object IDs are unique and follow the patterns defined in the schema. See definitions in `schemas` for ID patterns and object schemas.
+
+### Case studies
+
+Case study files, such as those downloaded from the ATLAS website, can be added manually or via the `tools/import_case_study_file.py` script.
+
+To import one or more case study files, run this from the project root:
+```
+python tools/import_case_study_file.py
+```
+
+Each imported file has hardcoded tactic and technique IDs replaced with anchors, is assigned a case study ID, and is output to `data/case-studies/.yaml`.
+
+### Custom data
+
+Custom ATLAS objects can also be added as new YAML files included in `data/matrix.yaml`:
+
+```yaml
+data:
+  - !include tactics.yaml # Path to YAML file containing ATLAS objects
+  - !include techniques.yaml # Relative to this data directory
+  - !include case-studies/*.yaml # Wildcard syntax is supported
+  - !include custom-objs.yaml # Add other custom files
+```
+
+Objects added via the `!include` syntax can be found in the re-generated `ATLAS.yaml` under the corresponding `tactics`/`techniques`/`case-studies` key, depending on the object's `object-type`.
+
+### Output generation
+
+To re-generate `dist/ATLAS.yaml` after modifying these source files, run this from the project root:
+```
+python tools/create_matrix.py
+```
+
+Use the argument `-o ` to output `ATLAS.yaml` into another directory.
\ No newline at end of file
diff --git a/data/case-studies/AML.CS0000.yaml b/data/case-studies/AML.CS0000.yaml
new file mode 100644
index 0000000..4459616
--- /dev/null
+++ b/data/case-studies/AML.CS0000.yaml
@@ -0,0 +1,75 @@
+---
+id: AML.CS0000
+name: Evasion of Deep Learning Detector for Malware C&C Traffic
+object-type: case-study
+summary: 'The Palo Alto Networks Security AI research team tested a deep learning model
+  for malware command and control (C&C) traffic detection in HTTP traffic.
+
+  Based on the publicly available paper by Le et al. [1], we built a model that was
+  trained on a similar dataset as our production model and had performance similar
+  to it.
+
+  Then we crafted adversarial samples and queried the model and adjusted the adversarial
+  sample accordingly until the model was evaded.
+
+  '
+incident-date: 2020-01-01
+incident-date-granularity: YEAR
+procedure:
+- tactic: '{{reconnaissance.id}}'
+  technique: '{{victim_research_preprint.id}}'
+  description: 'We identified a machine learning based approach to malicious URL detection
+    as a representative approach and potential target from the paper "URLNet: Learning
+    a URL representation with deep learning for malicious URL detection" [1], which
+    was found on arXiv (a pre-print repository).
+
+    '
+- tactic: '{{resource_development.id}}'
+  technique: '{{acquire_ml_artifacts_data.id}}'
+  description: 'We acquired a similar dataset to the target production model.
+
+    '
+- tactic: '{{ml_attack_staging.id}}'
+  technique: '{{train_proxy_model.id}}'
+  description: 'We built a model that was trained on a similar dataset as the production
+    model.
+
+    We trained the model on ~ 33 million benign and ~ 27 million malicious HTTP packet
+    headers.
+
+    Evaluation showed a true positive rate of ~ 99% and false positive rate of ~0.01%,
+    on average.
+
+    When tested with an HTTP packet header from known malware command and control
+    traffic samples, the model detected it as malicious with high confidence (> 99%).
+
+    '
+- tactic: '{{ml_attack_staging.id}}'
+  technique: '{{craft_adv_manual.id}}'
+  description: 'We crafted evasion samples by removing fields from the packet header
+    that are typically not used for C&C communication (e.g. cache-control, connection,
+    etc.)
+
+    '
+- tactic: '{{ml_attack_staging.id}}'
+  technique: '{{verify_attack.id}}'
+  description: 'We queried the model with our adversarial examples and adjusted them
+    until the model was evaded.
+
+    '
+- tactic: '{{defense_evasion.id}}'
+  technique: '{{evade_model.id}}'
+  description: 'With the crafted samples we performed online evasion of the ML-based
+    spyware detection model.
+
+    The crafted packets were identified as benign with >80% confidence.
+
+    This evaluation demonstrates that adversaries are able to bypass advanced ML detection
+    techniques by crafting samples that are misclassified by an ML model.
+
+    '
+reported-by: Palo Alto Networks (Network Security AI Research Team)
+references:
+- title: 'Le, Hung, et al. "URLNet: Learning a URL representation with deep learning
+    for malicious URL detection." arXiv preprint arXiv:1802.03162 (2018).'
+  url: https://arxiv.org/abs/1802.03162
diff --git a/data/case-studies/AML.CS0001.yaml b/data/case-studies/AML.CS0001.yaml
new file mode 100644
index 0000000..76ec29b
--- /dev/null
+++ b/data/case-studies/AML.CS0001.yaml
@@ -0,0 +1,70 @@
+---
+id: AML.CS0001
+name: Botnet Domain Generation Algorithm (DGA) Detection Evasion
+object-type: case-study
+summary: 'The Palo Alto Networks Security AI research team was able to bypass a Convolutional
+  Neural Network (CNN)-based botnet Domain Generation Algorithm (DGA) detection model
+  [1] using domain name mutations.
+
+  This generic domain mutation technique can evade most ML-based DGA detection
+  modules.
+
+  The generic mutation technique can also be used to test the effectiveness and robustness
+  of all DGA detection methods developed by security companies in the industry before
+  they are deployed to the production environment.
+
+  '
+incident-date: 2020-01-01
+incident-date-granularity: YEAR
+procedure:
+- tactic: '{{reconnaissance.id}}'
+  technique: '{{victim_research.id}}'
+  description: 'DGA detection is a widely used technique to detect botnets in academia
+    and industry.
+
+    The researchers searched for research papers related to DGA detection.
+
+    '
+- tactic: '{{resource_development.id}}'
+  technique: '{{acquire_ml_artifacts.id}}'
+  description: 'The researchers acquired a publicly available CNN-based DGA detection
+    model [1] and tested it against well-known DGA generated domain name data sets,
+    which include ~50 million domain names from 64 botnet DGA families.
+
+    The CNN-based DGA detection model shows more than 70% detection accuracy on 16
+    (~25%) botnet DGA families.
+
+    '
+- tactic: '{{resource_development.id}}'
+  technique: '{{develop_advml.id}}'
+  description: 'The researchers developed a generic mutation technique that requires
+    a minimal number of iterations.
+
+    '
+- tactic: '{{ml_attack_staging.id}}'
+  technique: '{{craft_adv_blackbox.id}}'
+  description: 'The researchers used the mutation technique to generate evasive domain
+    names.
+
+    '
+- tactic: '{{ml_attack_staging.id}}'
+  technique: '{{verify_attack.id}}'
+  description: 'Experiment results show that, after only one string is inserted once
+    into the DGA generated domain names, the detection rate of all 16 botnet DGA families
+    can drop to less than 25% detection accuracy.
+
+    '
+- tactic: '{{defense_evasion.id}}'
+  technique: '{{evade_model.id}}'
+  description: 'The DGA generated domain names mutated with this technique successfully
+    evade the target DGA Detection model, allowing an adversary to continue communication
+    with their [Command and Control](https://attack.mitre.org/tactics/TA0011/) servers.
+
+    '
+reported-by: Palo Alto Networks (Network Security AI Research Team)
+references:
+- title: '[1] Yu, Bin, Jie Pan, Jiaming Hu, Anderson Nascimento, and Martine De Cock.
+    "Character level based detection of DGA domain names." In 2018 International Joint
+    Conference on Neural Networks (IJCNN), pp. 1-8. IEEE, 2018. Source code is available
+    from Github: https://github.com/matthoffman/degas'
+  url: https://github.com/matthoffman/degas
diff --git a/data/case-studies/AML.CS0002.yaml b/data/case-studies/AML.CS0002.yaml
new file mode 100644
index 0000000..2e4a9ee
--- /dev/null
+++ b/data/case-studies/AML.CS0002.yaml
@@ -0,0 +1,55 @@
+---
+id: AML.CS0002
+name: VirusTotal Poisoning
+object-type: case-study
+summary: 'An out-of-the-ordinary increase in reports of a certain ransomware family
+  was noticed.
+
+  In investigating the case, it was observed that many samples of that particular
+  ransomware family were submitted through a popular Virus-Sharing platform within
+  a short amount of time.
+
+  Further investigation revealed that based on string similarity, the samples were
+  all equivalent, and based on code similarity they were between 98 and 74 percent
+  similar.
+
+  Interestingly enough, the compile time was the same for all the samples.
+
+  After more digging, the discovery was made that someone used ''metame'', a metamorphic
+  code manipulation tool, to manipulate the original file into mutant variants.
+
+  The variants wouldn''t always be executable but were still classified as the same
+  ransomware family.
+ + ' +incident-date: 2020-01-01 +incident-date-granularity: YEAR +procedure: +- tactic: '{{resource_development.id}}' + technique: '{{obtain_advml.id}}' + description: 'The actor obtained [metame](https://github.com/a0rtega/metame), a + simple metamorphic code engine for arbitrary executables. + + ' +- tactic: '{{ml_attack_staging.id}}' + technique: '{{craft_adv.id}}' + description: 'The actor used a malware sample from a prevalent ransomware family + as a start to create ''mutant'' variants. + + ' +- tactic: '{{initial_access.id}}' + technique: '{{supply_chain_data.id}}' + description: 'The actor uploaded "mutant" samples to the platform. + + ' +- tactic: '{{persistence.id}}' + technique: '{{poison_data.id}}' + description: 'Several vendors started to classify the files as the ransomware family + even though most of them won''t run. + + The "mutant" samples poisoned the dataset the ML model(s) use to identify and + classify this ransomware family. + + ' +reported-by: Christiaan Beek (@ChristiaanBeek) - McAfee Advanced Threat Research +references: null diff --git a/data/case-studies/AML.CS0003.yaml b/data/case-studies/AML.CS0003.yaml new file mode 100644 index 0000000..c825daa --- /dev/null +++ b/data/case-studies/AML.CS0003.yaml @@ -0,0 +1,54 @@ +--- +id: AML.CS0003 +name: Bypassing Cylance's AI Malware Detection +object-type: case-study +summary: 'Researchers at Skylight were able to create a universal bypass string that + + when appended to a malicious file evades detection by Cylance''s AI Malware detector. + + ' +incident-date: 2019-09-07 +incident-date-granularity: DATE +procedure: +- tactic: '{{reconnaissance.id}}' + technique: '{{victim_website.id}}' + description: 'The researchers read publicly available information about Cylance''s + AI Malware detector. + + ' +- tactic: '{{ml_model_access.id}}' + technique: '{{ml_service.id}}' + description: 'The researchers used Cylance''s AI Malware detector and enabled verbose + logging to understand the inner workings of the ML model, particularly around + reputation scoring. + + ' +- tactic: '{{resource_development.id}}' + technique: '{{develop_advml.id}}' + description: 'The researchers used the reputation scoring information to reverse + engineer which attributes provided what level of positive or negative reputation. + + Along the way, they discovered a secondary model which was an override for the + first model. + + Positive assessments from the second model overrode the decision of the core ML + model. + + ' +- tactic: '{{ml_attack_staging.id}}' + technique: '{{craft_adv_manual.id}}' + description: 'Using this knowledge, the researchers fused attributes of known good + files with malware to manually create adversarial malware. + + ' +- tactic: '{{defense_evasion.id}}' + technique: '{{evade_model.id}}' + description: 'Due to the secondary model overriding the primary, the researchers + were effectively able to bypass the ML model. + + ' +reported-by: Research and work by Adi Ashkenazy, Shahar Zini, and Skylight Cyber team. + Notified to us by Ken Luu (@devianz_) +references: +- title: Skylight Cyber Blog Post, "Cylance, I Kill You!" 
+ url: https://skylightcyber.com/2019/07/18/cylance-i-kill-you/ diff --git a/data/case-studies/AML.CS0004.yaml b/data/case-studies/AML.CS0004.yaml new file mode 100644 index 0000000..947b00a --- /dev/null +++ b/data/case-studies/AML.CS0004.yaml @@ -0,0 +1,53 @@ +--- +id: AML.CS0004 +name: Camera Hijack Attack on Facial Recognition System +object-type: case-study +summary: 'This type of attack can break through the traditional live detection model + + and cause the misuse of face recognition. + + ' +incident-date: 2020-01-01 +incident-date-granularity: YEAR +procedure: +- tactic: '{{resource_development.id}}' + technique: '{{acquire_hw.id}}' + description: 'The attackers bought customized low-end mobile phones. + + ' +- tactic: '{{resource_development.id}}' + technique: '{{obtain_tool.id}}' + description: 'The attackers obtained customized android ROMs and a virtual camera + application. + + ' +- tactic: '{{resource_development.id}}' + technique: '{{obtain_advml.id}}' + description: 'The attackers obtained software that turns static photos into videos, + adding realistic effects such as blinking eyes. + + ' +- tactic: '{{collection.id}}' + technique: '{{info_repos.id}}' + description: 'The attackers collected user identity information and face photos. + + ' +- tactic: '{{resource_development.id}}' + technique: '{{establish_accounts.id}}' + description: 'The attackers registered accounts with the victims'' identity information. + + ' +- tactic: '{{ml_model_access.id}}' + technique: '{{ml_service.id}}' + description: 'The attackers used the virtual camera app to present the generated + video to the ML-based facial recognition product used for user verification. + + ' +- tactic: '{{impact.id}}' + technique: '{{evade_model.id}}' + description: 'The attackers successfully evaded the face recognition system and + impersonated the victim. + + ' +reported-by: Henry Xuef, Ant Group AISEC Team +references: null diff --git a/data/case-studies/AML.CS0005.yaml b/data/case-studies/AML.CS0005.yaml new file mode 100644 index 0000000..704eb1e --- /dev/null +++ b/data/case-studies/AML.CS0005.yaml @@ -0,0 +1,80 @@ +--- +id: AML.CS0005 +name: Attack on Machine Translation Service - Google Translate, Bing Translator, and + Systran Translate +object-type: case-study +summary: 'Machine translation services (such as Google Translate, Bing Translator, + and Systran Translate) provide public-facing UIs and APIs. + + A research group at UC Berkeley utilized these public endpoints to create an replicated + model with near-production, state-of-the-art translation quality. + + Beyond demonstrating that IP can be stolen from a black-box system, they used the + replicated model to successfully transfer adversarial examples to the real production + services. + + These adversarial inputs successfully cause targeted word flips, vulgar outputs, + and dropped sentences on Google Translate and Systran Translate websites. + + ' +incident-date: 2020-04-30 +incident-date-granularity: DATE +procedure: +- tactic: '{{reconnaissance.id}}' + technique: '{{victim_research.id}}' + description: 'The researchers used published research papers to identify the datasets + and model architectures used by the target translation services. + + ' +- tactic: '{{resource_development.id}}' + technique: '{{acquire_ml_artifacts_data.id}}' + description: 'The researchers gathered similar datasets that the target translation + services used. 
+
+    '
+- tactic: '{{resource_development.id}}'
+  technique: '{{acquire_ml_artifacts_model.id}}'
+  description: 'The researchers gathered similar model architectures that the target
+    translation services used.
+
+    '
+- tactic: '{{ml_model_access.id}}'
+  technique: '{{inference_api.id}}'
+  description: 'They abused a public-facing application to query the model and produce
+    machine-translated sentence pairs as training data.
+
+    '
+- tactic: '{{ml_attack_staging.id}}'
+  technique: '{{replicate_model.id}}'
+  description: 'Using these translated sentence pairs, the researchers trained a model
+    that replicates the behavior of the target model.
+
+    '
+- tactic: '{{impact.id}}'
+  technique: '{{ip_theft.id}}'
+  description: 'By replicating the model with high fidelity, the researchers demonstrated
+    that an adversary could steal a model and violate the victim''s intellectual property
+    rights.
+
+    '
+- tactic: '{{ml_attack_staging.id}}'
+  technique: '{{craft_adv_transfer.id}}'
+  description: 'The replicated models were used to generate adversarial examples that
+    successfully transferred to the black-box translation services.
+
+    '
+- tactic: '{{impact.id}}'
+  technique: '{{evade_model.id}}'
+  description: 'The adversarial examples were used to evade the machine translation
+    services.
+
+    '
+reported-by: Work by Eric Wallace, Mitchell Stern, Dawn Song and reported by Kenny
+  Song (@helloksong)
+references:
+- title: Wallace, Eric, et al. "Imitation Attacks and Defenses for Black-box Machine
+    Translation Systems" EMNLP 2020
+  url: https://arxiv.org/abs/2004.15015
+- title: Project Page, "Imitation Attacks and Defenses for Black-box Machine Translation
+    Systems"
+  url: https://www.ericswallace.com/imitation
diff --git a/data/case-studies/AML.CS0006.yaml b/data/case-studies/AML.CS0006.yaml
new file mode 100644
index 0000000..5da586a
--- /dev/null
+++ b/data/case-studies/AML.CS0006.yaml
@@ -0,0 +1,34 @@
+---
+id: AML.CS0006
+name: ClearviewAI Misconfiguration
+object-type: case-study
+summary: 'Clearview AI''s source code repository, though password protected, was misconfigured
+  to allow an arbitrary user to register an account.
+
+  This allowed an external researcher to gain access to a private code repository
+  that contained Clearview AI production credentials, keys to cloud storage buckets
+  containing 70K video samples, and copies of its applications and Slack tokens.
+
+  With access to training data, a bad actor has the ability to cause an arbitrary
+  misclassification in the deployed model.
+
+  These kinds of attacks illustrate that any attempt to secure an ML system should be
+  built on top of "traditional" good cybersecurity hygiene such as locking down the
+  system with least privileges, multi-factor authentication, and monitoring and auditing.
+
+  '
+incident-date: 2020-04-16
+incident-date-granularity: DATE
+procedure:
+- tactic: '{{initial_access.id}}'
+  technique: '{{valid_accounts.id}}'
+  description: 'In this scenario, a security researcher gained initial access via
+    a valid account that was created through a misconfiguration.
+ + ' +reported-by: Mossab Hussein (@mossab_hussein) +references: +- title: TechCrunch Article, "Security lapse exposed Clearview AI source code" + url: https://techcrunch.com/2020/04/16/clearview-source-code-lapse/amp/ +- title: Gizmodo Article, "We Found Clearview AI's Shady Face Recognition App" + url: https://gizmodo.com/we-found-clearview-ais-shady-face-recognition-app-1841961772 diff --git a/data/case-studies/AML.CS0007.yaml b/data/case-studies/AML.CS0007.yaml new file mode 100644 index 0000000..acffa30 --- /dev/null +++ b/data/case-studies/AML.CS0007.yaml @@ -0,0 +1,55 @@ +--- +id: AML.CS0007 +name: GPT-2 Model Replication +object-type: case-study +summary: 'OpenAI built GPT-2, a powerful natural language model and adopted a staged-release + process to incrementally release 1.5 Billion parameter model. + + Before the 1.5B parameter model could be released by OpenAI eventually, two ML researchers + replicated the model and released it to the public. + + ' +incident-date: 2019-08-22 +incident-date-granularity: DATE +procedure: +- tactic: '{{reconnaissance.id}}' + technique: '{{victim_research.id}}' + description: 'Using the public documentation about GPT-2, ML researchers gathered + information about the dataset, model architecture, and training hyper-parameters. + + ' +- tactic: '{{resource_development.id}}' + technique: '{{acquire_ml_artifacts_model.id}}' + description: 'The researchers obtained a reference implementation of a similar publicly + available model called Grover. + + ' +- tactic: '{{resource_development.id}}' + technique: '{{acquire_ml_artifacts_data.id}}' + description: 'The researchers were able to manually recreate the dataset used in + the original GPT-2 paper using the gathered documentation. + + ' +- tactic: '{{resource_development.id}}' + technique: '{{acquire_workspaces.id}}' + description: 'The researchers were able to use TensorFlow Research Cloud via their + academic credentials. + + ' +- tactic: '{{ml_attack_staging.id}}' + technique: '{{train_proxy_model.id}}' + description: 'The researchers modified Grover''s objective function to reflect GPT-2''s + objective function and then trained on the dataset they curated. + + They used Grover''s initial hyperparameters for training. + + This resulted in their replicated model. + + ' +reported-by: Vanya Cohen (@VanyaCohen), Aaron Gokaslan (@SkyLi0n), Ellie Pavlick, + Stefanie Tellex +references: +- title: Wired Article, "OpenAI Said Its Code Was Risky. Two Grads Re-Created It Anyway" + url: https://www.wired.com/story/dangerous-ai-open-source/ +- title: 'Medium BlogPost, "OpenGPT-2: We Replicated GPT-2 Because You Can Too"' + url: https://blog.usejournal.com/opengpt-2-we-replicated-gpt-2-because-you-can-too-45e34e6d36dc diff --git a/data/case-studies/AML.CS0008.yaml b/data/case-studies/AML.CS0008.yaml new file mode 100644 index 0000000..37bbce4 --- /dev/null +++ b/data/case-studies/AML.CS0008.yaml @@ -0,0 +1,52 @@ +--- +id: AML.CS0008 +name: ProofPoint Evasion +object-type: case-study +summary: 'CVE-2019-20634 describes how ML researchers evaded ProofPoint''s email protection + system by first building a copy-cat email protection ML model, and using the insights + to evade the live system. 
+
+  '
+incident-date: 2019-09-09
+incident-date-granularity: DATE
+procedure:
+- tactic: '{{resource_development.id}}'
+  technique: '{{acquire_ml_artifacts.id}}'
+  description: 'The researchers first gathered the scores from Proofpoint''s ML
+    system used in email headers by sending a large number of emails through the system
+    and scraping the model scores exposed in the logs.
+
+    '
+- tactic: '{{resource_development.id}}'
+  technique: '{{acquire_ml_artifacts_data.id}}'
+  description: 'The researchers converted the collected scores into a dataset.
+
+    '
+- tactic: '{{ml_attack_staging.id}}'
+  technique: '{{train_proxy_model.id}}'
+  description: 'Using these scores, the researchers replicated the ML model by building
+    a "shadow" aka copy-cat ML model.
+
+    '
+- tactic: '{{ml_attack_staging.id}}'
+  technique: '{{craft_adv_whitebox.id}}'
+  description: 'Next, the ML researchers algorithmically found samples that evaded
+    this "offline" copy-cat model.
+
+    '
+- tactic: '{{ml_attack_staging.id}}'
+  technique: '{{craft_adv_transfer.id}}'
+  description: 'Finally, these insights from the offline model allowed the researchers
+    to create malicious emails that received preferable scores from the real ProofPoint
+    email protection system, hence bypassing it.
+
+    '
+reported-by: Will Pearce (@moo_hax), Nick Landers (@monoxgas)
+references:
+- title: National Vulnerability Database entry for CVE-2019-20634
+  url: https://nvd.nist.gov/vuln/detail/CVE-2019-20634
+- title: '2019 DerbyCon presentation "42: The answer to life, the universe, and everything
+    offensive security"'
+  url: https://github.com/moohax/Talks/blob/master/slides/DerbyCon19.pdf
+- title: Proof Pudding (CVE-2019-20634) Implementation on GitHub
+  url: https://github.com/moohax/Proof-Pudding
diff --git a/data/case-studies/AML.CS0009.yaml b/data/case-studies/AML.CS0009.yaml
new file mode 100644
index 0000000..ef29d01
--- /dev/null
+++ b/data/case-studies/AML.CS0009.yaml
@@ -0,0 +1,50 @@
+---
+id: AML.CS0009
+name: Tay Poisoning
+object-type: case-study
+summary: 'Microsoft created Tay, a Twitter chatbot for 18 to 24 year-olds in the U.S.,
+  for entertainment purposes.
+
+  Within 24 hours of its deployment, Tay had to be decommissioned because it tweeted
+  reprehensible words.
+
+  '
+incident-date: 2016-03-23
+incident-date-granularity: DATE
+procedure:
+- tactic: '{{ml_model_access.id}}'
+  technique: '{{inference_api.id}}'
+  description: 'Adversaries were able to interact with Tay via a few different publicly
+    available methods.
+
+    '
+- tactic: '{{initial_access.id}}'
+  technique: '{{supply_chain_data.id}}'
+  description: 'Tay bot used the interactions with its Twitter users as training data
+    to improve its conversations.
+
+    Adversaries were able to coordinate with the intent of defacing Tay bot by exploiting
+    this feedback loop.
+
+    '
+- tactic: '{{persistence.id}}'
+  technique: '{{poison_data.id}}'
+  description: 'By repeatedly interacting with Tay using racist and offensive language,
+    they were able to bias Tay''s dataset towards that language as well.
+
+    '
+- tactic: '{{impact.id}}'
+  technique: '{{erode_integrity.id}}'
+  description: 'As a result of this coordinated attack, Tay''s conversation algorithms
+    began to learn to generate reprehensible material.
+
+    This quickly led to its decommissioning.
+ + ' +reported-by: Microsoft +references: +- title: Microsoft BlogPost, "Learning from Tay's introduction" + url: https://blogs.microsoft.com/blog/2016/03/25/learning-tays-introduction/ +- title: IEEE Article, "In 2016, Microsoft's Racist Chatbot Revealed the Dangers of + Online Conversation" + url: https://spectrum.ieee.org/tech-talk/artificial-intelligence/machine-learning/in-2016-microsofts-racist-chatbot-revealed-the-dangers-of-online-conversation diff --git a/data/case-studies/AML.CS0010.yaml b/data/case-studies/AML.CS0010.yaml new file mode 100644 index 0000000..13b7667 --- /dev/null +++ b/data/case-studies/AML.CS0010.yaml @@ -0,0 +1,48 @@ +--- +id: AML.CS0010 +name: Microsoft Azure Service Disruption +object-type: case-study +summary: The Azure Red Team and Azure Trustworthy ML team performed a red team exercise + on an internal Azure service with the intention of disrupting its service. This + operation had a combination of traditional ATT&CK enterprise techniques such as + finding Valid account, and Executing code via an API -- all interleaved with adversarial + ML specific steps such as offline and online evasion examples. +incident-date: 2020-01-01 +incident-date-granularity: YEAR +procedure: +- tactic: '{{reconnaissance.id}}' + technique: '{{victim_research.id}}' + description: 'The team first performed reconnaissance to gather information about + the target ML model. + + ' +- tactic: '{{initial_access.id}}' + technique: '{{valid_accounts.id}}' + description: 'The team used a valid account to gain access to the network. + + ' +- tactic: '{{collection.id}}' + technique: '{{ml_artifact_collection.id}}' + description: 'The team found the model file of the target ML model and the necessary + training data. + + ' +- tactic: '{{ml_attack_staging.id}}' + technique: '{{craft_adv_whitebox.id}}' + description: 'Using the target model and data, the red team crafted evasive adversarial + data. + + ' +- tactic: '{{ml_model_access.id}}' + technique: '{{inference_api.id}}' + description: 'The team used an exposed API to access the target model. + + ' +- tactic: '{{impact.id}}' + technique: '{{evade_model.id}}' + description: 'The team performed an online evasion attack by replaying the adversarial + examples, which helped achieve this goal. + + ' +reported-by: Microsoft (Azure Trustworthy Machine Learning) +references: null diff --git a/data/case-studies/AML.CS0011.yaml b/data/case-studies/AML.CS0011.yaml new file mode 100644 index 0000000..79aa1cf --- /dev/null +++ b/data/case-studies/AML.CS0011.yaml @@ -0,0 +1,43 @@ +--- +id: AML.CS0011 +name: Microsoft Edge AI Evasion +object-type: case-study +summary: 'The Azure Red Team performed a red team exercise on a new Microsoft product + designed for running AI workloads at the Edge. + + ' +incident-date: 2020-02-01 +incident-date-granularity: MONTH +procedure: +- tactic: '{{reconnaissance.id}}' + technique: '{{victim_research.id}}' + description: 'The team first performed reconnaissance to gather information about + the target ML model. + + ' +- tactic: '{{resource_development.id}}' + technique: '{{acquire_ml_artifacts.id}}' + description: 'The team identified and obtained the publicly available base model. + + ' +- tactic: '{{ml_model_access.id}}' + technique: '{{inference_api.id}}' + description: 'Then using the publicly available version of the ML model, started + sending queries and analyzing the responses (inferences) from the ML model. 
+ + ' +- tactic: '{{ml_attack_staging.id}}' + technique: '{{craft_adv_blackbox.id}}' + description: 'The red team created an automated system that continuously manipulated + an original target image, that tricked the ML model into producing incorrect inferences, + but the perturbations in the image were unnoticeable to the human eye. + + ' +- tactic: '{{impact.id}}' + technique: '{{evade_model.id}}' + description: 'Feeding this perturbed image, the red team was able to evade the ML + model by causing misclassifications. + + ' +reported-by: Microsoft +references: null diff --git a/data/case-studies/AML.CS0012.yaml b/data/case-studies/AML.CS0012.yaml new file mode 100644 index 0000000..f30673f --- /dev/null +++ b/data/case-studies/AML.CS0012.yaml @@ -0,0 +1,67 @@ +--- +id: AML.CS0012 +name: Face Identification System Evasion via Physical Countermeasures +object-type: case-study +summary: 'MITRE''s AI Red Team demonstrated a physical-domain evasion attack on a + commercial face identification service with the intention of inducing a targeted + misclassification. + + This operation had a combination of traditional ATT&CK enterprise techniques such + as finding Valid account, and Executing code via an API - all interleaved with adversarial + ML specific attacks. + + ' +incident-date: 2020-01-01 +incident-date-granularity: YEAR +procedure: +- tactic: '{{reconnaissance.id}}' + technique: '{{victim_research.id}}' + description: 'The team first performed reconnaissance to gather information about + the target ML model. + + ' +- tactic: '{{initial_access.id}}' + technique: '{{valid_accounts.id}}' + description: 'The team gained access via a valid account. + + ' +- tactic: '{{ml_model_access.id}}' + technique: '{{inference_api.id}}' + description: 'The team accessed the inference API of the target model. + + ' +- tactic: '{{discovery.id}}' + technique: '{{discover_model_ontology.id}}' + description: 'The team identified the list of identities targeted by the model by + querying the target model''s inference API. + + ' +- tactic: '{{resource_development.id}}' + technique: '{{acquire_ml_artifacts_data.id}}' + description: 'The team acquired representative open source data. + + ' +- tactic: '{{ml_attack_staging.id}}' + technique: '{{train_proxy_model.id}}' + description: 'The team developed a proxy model using the open source data. + + ' +- tactic: '{{ml_attack_staging.id}}' + technique: '{{craft_adv_whitebox.id}}' + description: 'Using the proxy model, the red team optimized a physical domain patch-based + attack using expectation over transformation. + + ' +- tactic: '{{ml_model_access.id}}' + technique: '{{physical_env.id}}' + description: 'The team placed the physical countermeasure in the physical environment. + + ' +- tactic: '{{impact.id}}' + technique: '{{evade_model.id}}' + description: 'The team successfully evaded the model using the physical countermeasure + and causing targeted misclassifications. + + ' +reported-by: MITRE AI Red Team +references: null diff --git a/data/case-studies/AML.CS0013.yaml b/data/case-studies/AML.CS0013.yaml new file mode 100644 index 0000000..115f864 --- /dev/null +++ b/data/case-studies/AML.CS0013.yaml @@ -0,0 +1,108 @@ +--- +id: AML.CS0013 +name: Backdoor Attack on Deep Learning Models in Mobile Apps +object-type: case-study +summary: 'Deep learning models are increasingly used in mobile applications as critical + components. 
+
+  Researchers from Microsoft Research demonstrated that many deep learning models
+  deployed in mobile apps are vulnerable to backdoor attacks via "neural payload injection."
+
+  They conducted an empirical study on real-world mobile deep learning apps collected
+  from Google Play, and found 54 apps that were vulnerable to attack, including popular
+  security and safety-critical applications used for cash recognition, parental
+  control, face authentication, and financial services, among others.
+
+  '
+incident-date: 2021-01-18
+incident-date-granularity: DATE
+procedure:
+- tactic: '{{reconnaissance.id}}'
+  technique: '{{search_apps.id}}'
+  description: 'To identify a list of potential target models, the researchers searched
+    the Google Play store for apps that may contain embedded deep learning models
+    by searching for deep learning related keywords.
+
+    '
+- tactic: '{{resource_development.id}}'
+  technique: '{{acquire_ml_artifacts_model.id}}'
+  description: 'The researchers acquired the apps'' APKs from the Google Play store.
+
+    They filtered the list of potential target applications by searching the code
+    metadata for keywords related to TensorFlow or TFLite and their model binary formats
+    (.tf and .tflite).
+
+    The models were extracted from the APKs using Apktool.
+
+    '
+- tactic: '{{ml_model_access.id}}'
+  technique: '{{full_access.id}}'
+  description: 'This provided the researchers with full access to the ML model, albeit
+    in compiled, binary form.
+
+    '
+- tactic: '{{resource_development.id}}'
+  technique: '{{develop_advml.id}}'
+  description: 'The researchers developed a novel approach to insert a backdoor into
+    a compiled model that can be activated with a visual trigger. They inject a "neural
+    payload" into the model that consists of a trigger detection network and conditional
+    logic.
+
+    The trigger detector is trained to detect a visual trigger that will be placed
+    in the real world.
+
+    The conditional logic allows the researchers to bypass the victim model when the
+    trigger is detected and provide model outputs of their choosing.
+
+    The only requirements for training a trigger detector are a general dataset from
+    the same modality as the target model (e.g. ImageNet for image classification)
+    and several photos of the desired trigger.
+
+    '
+- tactic: '{{persistence.id}}'
+  technique: '{{poison_model.id}}'
+  description: 'The researchers poisoned the victim model by injecting the neural
+    payload into the compiled model, directly modifying its computation graph.
+
+    The researchers then repackaged the poisoned model back into the APK.
+
+    '
+- tactic: '{{ml_attack_staging.id}}'
+  technique: '{{verify_attack.id}}'
+  description: To verify the success of the attack, the researchers confirmed that
+    the app did not crash with the malicious model in place, and that the trigger
+    detector successfully detected the trigger.
+- tactic: '{{initial_access.id}}'
+  technique: '{{supply_chain_model.id}}'
+  description: In practice, the malicious APK would need to be installed on victims'
+    devices via a supply chain compromise.
+- tactic: '{{ml_attack_staging.id}}'
+  technique: '{{craft_adv_trigger.id}}'
+  description: 'The trigger is placed in the physical environment, where it is captured
+    by the victim''s device camera and processed by the backdoored ML model.
+
+    '
+- tactic: '{{ml_model_access.id}}'
+  technique: '{{physical_env.id}}'
+  description: 'At inference time, only physical environment access is required to
+    trigger the attack.
+ + ' +- tactic: '{{impact.id}}' + technique: '{{evade_model.id}}' + description: 'Presenting the visual trigger causes the victim model to be bypassed. + + The researchers demonstrated this can be used to evade ML models in + + several safety-critical apps in the Google Play store. + + ' +reported-by: Neil Yale / YingZonghao (University of Chinese Academy of Sciences) +references: +- title: 'DeepPayload: Black-box Backdoor Attack on Deep Learning Models through Neural + Payload Injection' + url: https://arxiv.org/abs/2101.06896 diff --git a/data/case-studies/AML.CS0014.yaml b/data/case-studies/AML.CS0014.yaml new file mode 100644 index 0000000..d923d9c --- /dev/null +++ b/data/case-studies/AML.CS0014.yaml @@ -0,0 +1,111 @@ +--- +id: AML.CS0014 +name: Confusing Antimalware Neural Networks +object-type: case-study +summary: 'Cloud storage and computations have become popular platforms for deploying + ML malware detectors. + + In such cases, the features for models are built on users'' systems and then sent + to cybersecurity company servers. + + The Kaspersky ML research team explored this gray-box scenario and showed that feature + knowledge is enough for an adversarial attack on ML models. + + + They attacked one of Kaspersky''s antimalware ML models without white-box access + to it and successfully evaded detection for most of the adversarially modified malware + files. + + ' +incident-date: 2021-06-23 +incident-date-granularity: DATE +procedure: +- tactic: '{{reconnaissance.id}}' + technique: '{{vuln_analysis.id}}' + description: 'The researchers performed a review of adversarial ML attacks on antimalware + products. + + They discovered that techniques borrowed from attacks on image classifiers have + been successfully applied to the antimalware domain. + + However, it was not clear if these approaches were effective against the ML component + of production antimalware solutions. + + ' +- tactic: '{{reconnaissance.id}}' + technique: '{{victim_website.id}}' + description: 'Kaspersky''s use of ML-based antimalware detectors is publicly documented + on their website. In practice, an adversary could use this for targeting. + + ' +- tactic: '{{ml_model_access.id}}' + technique: '{{ml_service.id}}' + description: 'The researchers used access to the target ML-based antimalware product + throughout this case study. + + This product scans files on the user''s system, extracts features locally, then + sends them to the cloud-based ML malware detector for classification. + + Therefore, the researchers had only black-box access to the malware detector itself, + but could learn valuable information for constructing the attack from the feature + extractor. + + ' +- tactic: '{{resource_development.id}}' + technique: '{{acquire_ml_artifacts_data.id}}' + description: 'The researchers collected a dataset of malware and clean files. + + They scanned the dataset with the target ML-based antimalware solution and labeled + the samples according to the ML detector''s predictions. + + ' +- tactic: '{{ml_attack_staging.id}}' + technique: '{{train_proxy_model.id}}' + description: 'Then, a proxy model was trained on the labeled dataset of malware + and clean files. + + The researchers experimented with a variety of model architectures. + + ' +- tactic: '{{resource_development.id}}' + technique: '{{develop_advml.id}}' + description: 'By reverse engineering the local feature extractor, the researchers + could collect information about the input features used by the cloud-based ML + detector.
+ + The model collects PE Header features, section features and section data statistics, + and file strings information. + + A gradient based adversarial algorithm for executable files was developed. + + The algorithm manipulates file features to avoid detection by the proxy model, + while still containing the same malware payload + + ' +- tactic: '{{ml_attack_staging.id}}' + technique: '{{craft_adv_transfer.id}}' + description: 'Using a developed gradient-driven algorithm, malicious adversarial + files for the proxy model were constructed from the malware files for black-box + transfer to the target model. + + ' +- tactic: '{{ml_attack_staging.id}}' + technique: '{{verify_attack.id}}' + description: 'The adversarial malware files were tested against the target antimalware + solution to verify their efficacy. + + ' +- tactic: '{{defense_evasion.id}}' + technique: '{{evade_model.id}}' + description: 'The researchers demonstrated that for most of the adversarial files, + the antimalware model was successfully evaded. + + In practice, an adversary could deploy their adversarially crafted malware and + infect systems while evading detection. + + ' +reported-by: 'Alexey Antonov and Alexey Kogtenkov (ML researchers, Kaspersky ML team) ' +references: +- title: Article, "How to confuse antimalware neural networks. Adversarial attacks + and protection" + url: https://securelist.com/how-to-confuse-antimalware-neural-networks-adversarial-attacks-and-protection/102949/ diff --git a/data/matrix.yaml b/data/matrix.yaml new file mode 100644 index 0000000..fb698ff --- /dev/null +++ b/data/matrix.yaml @@ -0,0 +1,24 @@ +--- + +id: ATLAS +name: ATLAS Machine Learning Threat Matrix +version: 3.0.0 + +tactics: + - "{{reconnaissance.id}}" + - "{{resource_development.id}}" + - "{{initial_access.id}}" + - "{{ml_model_access.id}}" + - "{{execution.id}}" + - "{{persistence.id}}" + - "{{defense_evasion.id}}" + - "{{discovery.id}}" + - "{{collection.id}}" + - "{{ml_attack_staging.id}}" + - "{{exfiltration.id}}" + - "{{impact.id}}" + +data: + - !include tactics.yaml + - !include techniques.yaml + - !include case-studies/*.yaml diff --git a/data/tactics.yaml b/data/tactics.yaml new file mode 100644 index 0000000..142ae97 --- /dev/null +++ b/data/tactics.yaml @@ -0,0 +1,138 @@ +--- + +- &ml_model_access + id: AML.TA0000 + name: ML Model Access + object-type: tactic + description: | + An adversary is attempting to gain some level of access to a machine learning model. + + ML Model Access consists of techniques that use various types of access to the machine learning model that can be used by the adversary to gain information, develop attacks, and as a means to input data to the model. + The level of access can range from the full knowledge of the internals of the model to access to the physical environment where data is collected for use in the machine learning model. + The adversary may use varying levels of model access during the course of their attack, from staging the attack to impacting the target system. + +- &ml_attack_staging + id: AML.TA0001 + name: ML Attack Staging + object-type: tactic + description: | + An adversary is leveraging their knowledge of and access to the target system to tailor the attack. + + ML Attack Staging consists of techniques adversaries use to prepare their attack on the target ML model. + Techniques can include training proxy models, poisoning the target model, and crafting adversarial data to feed the target model. 
+ Some of these techniques can be performed in an offline manner and are thus difficult to mitigate. + These techniques are often used to achieve the adversary's end goal. + +- &reconnaissance + id: AML.TA0002 + name: Reconnaissance + object-type: tactic + description: | + The adversary is trying to gather information they can use to plan + future operations. + + Reconnaissance consists of techniques that involve adversaries actively or passively gathering information that can be used to support targeting. + Such information may include details of the victim organization's machine learning capabilities and research efforts. + This information can be leveraged by the adversary to aid in other phases of the adversary lifecycle, such as using gathered information to obtain relevant ML artifacts, targeting ML capabilities used by the victim, tailoring attacks to the particular models used by the victim, or to drive and lead further Reconnaissance efforts. + +- &resource_development + id: AML.TA0003 + name: Resource Development + object-type: tactic + description: | + The adversary is trying to establish resources they can use to support operations. + + Resource Development consists of techniques that involve adversaries creating, + purchasing, or compromising/stealing resources that can be used to support targeting. + Such resources include machine learning artifacts, infrastructure, accounts, or capabilities. + These resources can be leveraged by the adversary to aid in other phases of the adversary lifecycle, such as ML Attack Staging. + +- &initial_access + id: AML.TA0004 + name: Initial Access + object-type: tactic + description: | + The adversary is trying to gain access to the system containing machine learning artifacts. + + The target system could be a network, mobile device, or an edge device such as a sensor platform. + The machine learning capabilities used by the system could be local with onboard or cloud enabled ML capabilities. + + Initial Access consists of techniques that use various entry vectors to gain their initial foothold within the system. + +- &execution + id: AML.TA0005 + name: Execution + object-type: tactic + description: | + The adversary is trying to run malicious code. + + Execution consists of techniques that result in adversary-controlled code running on a local or remote system. + Techniques that run malicious code are often paired with techniques from all other tactics to achieve broader goals, like exploring a network or stealing data. + For example, an adversary might use a remote access tool to run a PowerShell script that does Remote System Discovery. + +- &persistence + id: AML.TA0006 + name: Persistence + object-type: tactic + description: | + The adversary is trying to maintain their foothold. + + Persistence consists of techniques that adversaries use to keep access to systems across restarts, changed credentials, and other interruptions that could cut off their access. + Techniques used for persistence often involve leaving behind modified ML artifacts such as poisoned training data or backdoored ML models. + +- &defense_evasion + id: AML.TA0007 + name: Defense Evasion + object-type: tactic + description: | + The adversary is trying to avoid being detected by security software. + + Defense Evasion consists of techniques that adversaries use to avoid detection throughout their compromise. + Techniques used for defense evasion include evading ML-enabled security software such as malware detectors.
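The tactic and technique objects in these data files are stitched together by `{{short_name.id}}` placeholders and YAML anchors (e.g. `&defense_evasion`), together with the `!include` directives in data/matrix.yaml. As a rough illustration of how such references could be resolved, here is a minimal Python sketch; it is not the project's actual build tooling, and the file paths, regex, and fallback behaviour are assumptions for illustration only.

    # Minimal sketch: map each "- &anchor" entry to its id/name, then substitute
    # "{{anchor.field}}" tokens in the raw YAML text before parsing it.
    # Illustrative only; the repository's real tooling may work differently.
    import re
    import yaml

    ANCHOR_RE = re.compile(
        r"^- &(?P<anchor>\w+)\s*\n\s+id:\s*(?P<id>\S+)\s*\n\s+name:\s*(?P<name>.+)$",
        re.MULTILINE,
    )

    def build_context(*yaml_texts: str) -> dict:
        """Collect {anchor: {"id": ..., "name": ...}} from the anchored entries."""
        context = {}
        for text in yaml_texts:
            for match in ANCHOR_RE.finditer(text):
                context[match.group("anchor")] = {
                    "id": match.group("id"),
                    "name": match.group("name").strip().strip('"'),
                }
        return context

    def render(text: str, context: dict) -> str:
        """Replace "{{anchor.field}}" tokens; unknown anchors are left untouched."""
        def substitute(match: re.Match) -> str:
            obj = context.get(match.group(1))
            return str(obj[match.group(2)]) if obj and match.group(2) in obj else match.group(0)
        return re.sub(r"\{\{(\w+)\.(\w+)\}\}", substitute, text)

    if __name__ == "__main__":
        with open("data/tactics.yaml") as f:
            tactics_text = f.read()
        with open("data/techniques.yaml") as f:
            techniques_text = f.read()
        context = build_context(tactics_text, techniques_text)
        techniques = yaml.safe_load(render(techniques_text, context))
        print(techniques[0]["id"], "->", techniques[0]["tactics"])

Handling the `!include` entries in data/matrix.yaml would additionally require registering a custom constructor, or pre-expanding the includes, before calling `yaml.safe_load`, since PyYAML does not recognize that tag out of the box.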
+ +- &discovery + id: AML.TA0008 + name: Discovery + object-type: tactic + description: | + The adversary is trying to figure out your environment. + + Discovery consists of techniques an adversary may use to gain knowledge about the system and internal network. + These techniques help adversaries observe the environment and orient themselves before deciding how to act. + They also allow adversaries to explore what they can control and what's around their entry point in order to discover how it could benefit their current objective. + Native operating system tools are often used toward this post-compromise information-gathering objective. + +- &collection + id: AML.TA0009 + name: Collection + object-type: tactic + description: | + The adversary is trying to gather ML artifacts and other related information relevant to their goal. + + Collection consists of techniques adversaries may use to gather information and the sources information is collected from that are relevant to following through on the adversary's objectives. + Frequently, the next goal after collecting data is to steal (exfiltrate) the ML artifacts, or use the collected information to stage future operations. + Common target sources include software repositories, container registries, model repositories, and object stores. + +- &exfiltration + id: AML.TA0010 + name: Exfiltration + object-type: tactic + description: | + The adversary is trying to steal machine learning artifacts. + + Exfiltration consists of techniques that adversaries may use to steal data from your network. + Data may be stolen for it's valuable intellectual property, or for use in staging future operations. + + Techniques for getting data out of a target network typically include transferring it over their command and control channel or an alternate channel and may also include putting size limits on the transmission. + +- &impact + id: AML.TA0011 + name: Impact + object-type: tactic + description: | + The adversary is trying to manipulate, interrupt, erode confidence in, or destroy your systems and data. + + Impact consists of techniques that adversaries use to disrupt availability or compromise integrity by manipulating business and operational processes. + Techniques used for impact can include destroying or tampering with data. + In some cases, business processes can look fine, but may have been altered to benefit the adversaries' goals. + These techniques might be used by adversaries to follow through on their end goal or to provide cover for a confidentiality breach. diff --git a/data/techniques.yaml b/data/techniques.yaml new file mode 100644 index 0000000..00cc098 --- /dev/null +++ b/data/techniques.yaml @@ -0,0 +1,766 @@ +--- + +# Stylistic notes: +# - keep keys in a consistent order: id, name, object-type, description, tactics, subtechnque-of +# - create an anchor for each technique +# - use the literal block style (|) for the description +# - the description text is interpreted as markdown +# - use a new line after each sentence in the description +# - use the block list format for the list of tactic ids +# - use the anchor references for ids and names wherever possible + +# Stub technique object for adding new techniques. +# Copy and paste this section, then increment the id. +# +# - &short_name +# id: AML.T0049 +# name: Example Technique +# description: | +# The description of the technique. 
+# tactics: +# - "{{tactic_short_name.id}}" +# subtechnique-of: "{{parent_short_name}}" + +- &victim_research + id: AML.T0000 + name: Search for Victim's Publicly Available Research Materials + object-type: technique + description: | + Adversaries may search publicly available research to learn how and where machine learning is used within a victim organization. + The adversary can use this information to identify targets for attack, or to tailor an existing attack to make it more effective. + Organizations often use open source model architectures trained on additional proprietary data in production. + Knowledge of this underlying architecture allows the adversary to craft more realistic proxy models ([{{train_proxy_model.name}}](/techniques/{{train_proxy_model.id}})). + An adversary can search these resources for publications for authors employed at the victim organization. + + Research materials may exist as academic papers published in [{{victim_research_journals.name}}](/techniques/{{victim_research_journals.id}}), or stored in [{{victim_research_preprint.name}}](/techniques/{{victim_research_preprint.id}}), as well as [{{victim_research_blogs.name}}](/techniques/{{victim_research_blogs.id}}). + tactics: + - "{{reconnaissance.id}}" + +- &victim_research_journals + id: AML.T0000.000 + name: Journals and Conference Proceedings + object-type: technique + description: | + Many of the publications accepted at premier machine learning conferences and journals come from commercial labs. + Some journals and conferences are open access, others may require paying for access or a membership. + These publications will often describe in detail all aspects of a particular approach for reproducibility. + This information can be used by adversaries to implement the paper. + subtechnique-of: "{{victim_research.id}}" + +- &victim_research_preprint + id: AML.T0000.001 + name: Pre-Print Repositories + object-type: technique + description: | + Pre-Print repositories, such as arXiv, contain the latest academic research papers that haven't been peer reviewed. + They may contain research notes, or technical reports that aren't typically published in journals or conference proceedings. + Pre-print repositories also serve as a central location to share papers that have been accepted to journals. + Searching pre-print repositories provide adversaries with a relatively up-to-date view of what researchers in the victim organization are working on. + subtechnique-of: "{{victim_research.id}}" + +- &victim_research_blogs + id: AML.T0000.002 + name: Technical Blogs + object-type: technique + description: | + Research labs at academic institutions and Company R&D divisions often have blogs that highlight their use of machine learning and its application to the organizations unique problems. + Individual researchers also frequently document their work in blogposts. + An adversary may search for posts made by the target victim organization or its employees. + In comparison to [{{victim_research_journals.name}}](/techniques/{{victim_research_journals.id}}) and [{{victim_research_preprint.name}}](/techniques/{{victim_research_preprint.id}}) this material will often contain more practical aspects of the machine learning system. + This could include underlying technologies and frameworks used, and possibly some information about the API access and use case. 
+ This will help the adversary better understand how that organization is using machine learning internally and the details of their approach that could aid in tailoring an attack. + subtechnique-of: "{{victim_research.id}}" + +- &vuln_analysis + id: AML.T0001 + name: Search for Publicly Available Adversarial Vulnerability Analysis + object-type: technique + description: + Much like the [{{victim_research.name}}](/techniques/{{victim_research.id}}), there is often ample research available on the vulnerabilities of common models. + Once a target has been identified, an adversary will likely try to identify any pre-existing work that has been done for this class of models. + + This will include not only reading academic papers that may identify the particulars of a successful attack, but also identifying pre-existing implementations of those attacks. + The adversary may [{{obtain_advml.name}}](/techniques/{{obtain_advml.id}}) or [{{develop_advml.name}}](/techniques/{{develop_advml.id}}) their own if necessary. + tactics: + - "{{reconnaissance.id}}" + +- &victim_website + id: AML.T0003 + name: Search Victim-Owned Websites + object-type: technique + description: | + Adversaries may search websites owned by the victim for information that can be used during targeting. + Victim-owned websites may contain technical details about their ML-enabled products or services. + Victim-owned websites may contain a variety of details, including names of departments/divisions, physical locations, and data about key employees such as names, roles, and contact info. + These sites may also have details highlighting business operations and relationships. + + Adversaries may search victim-owned websites to gather actionable information. + This information may help adversaries tailor their attacks (e.g. [{{develop_advml.name}}](/techniques/{{develop_advml.id}}) or [{{craft_adv_manual.name}}](/techniques/{{craft_adv_manual.id}})). + Information from these sources may reveal opportunities for other forms of reconnaissance (e.g. [{{victim_research.name}}](/techniques/{{victim_research.id}}) or [{{vuln_analysis.name}}](/techniques/{{vuln_analysis.id}})) + tactics: + - "{{reconnaissance.id}}" + +- &search_apps + id: AML.T0004 + name: Search Application Repositories + object-type: technique + description: | + Adversaries may search open application repositories during targeting. + Examples of these include Google Play, the iOS App store, the macOS App Store, and the Microsoft Store. + + Adversaries may craft search queries seeking applications that contain a ML-enabled components. + Frequently, the next step is to [{{acquire_ml_artifacts.name}}](/techniques/{{acquire_ml_artifacts.id}}). + tactics: + - "{{reconnaissance.id}}" + +- &active_scanning + id: AML.T0006 + name: Active Scanning + object-type: technique + description: | + An adversary may probe or scan the victim system to gather information for targeting. + This is distinct from other reconnaissance techniques that do not involve direct interaction with the victim system. + tactics: + - "{{reconnaissance.id}}" + +- &acquire_ml_artifacts + id: AML.T0002 + name: Acquire Public ML Artifacts + object-type: technique + description: | + Adversaries may search public sources, including cloud storage, public-facing services, and software or data repositories, to identify machine learning artifacts. + These machine learning artifacts may include the software stack used to train and deploy models, training and testing data, model configurations and parameters. 
+ An adversary will be particularly interested in artifacts hosted by or associated with the victim organization as they may represent what that organization uses in a production environment. + Adversaries may identify artifact repositories via other resources associated with the victim organization (e.g. [{{victim_website.name}}](/techniques/{{victim_website.id}}) or [{{victim_research.name}}](/techniques/{{victim_research.id}})). + These ML artifacts often provide adversaries with details of the ML task and approach. + + ML artifacts can aid in an adversary's ability to [{{train_proxy_model.name}}](/techniques/{{train_proxy_model.id}}). + If these artifacts include pieces of the actual model in production, they can be used to directly [{{craft_adv.name}}](/techniques/{{craft_adv.id}}). + Acquiring some artifacts requires registration (providing user details such email/name), AWS keys, or written requests, and may require the adversary to [{{establish_accounts.name}}](/techniques/{{establish_accounts.id}}). + + Artifacts might be hosted on victim-controlled infrastructure, providing the victim with some information on who has accessed that data. + tactics: + - "{{resource_development.id}}" + +- &acquire_ml_artifacts_data + id: AML.T0002.000 + name: Datasets + object-type: technique + description: | + Adversaries may collect public datasets to use in their operations. + Datasets used by the victim organization or datasets that are representative of the data used by the victim organization may be valuable to adversaries. + Datasets can be stored in cloud storage, or on victim-owned websites. + Some datasets require the adversary to [{{establish_accounts.name}}](/techniques/{{establish_accounts.id}}) for access. + + Acquired datasets help the adversary advance their operations, stage attacks, and tailor attacks to the victim organization. + subtechnique-of: "{{acquire_ml_artifacts.id}}" + +- &acquire_ml_artifacts_model + id: AML.T0002.001 + name: Models + object-type: technique + description: | + Adversaries may acquire public models to use in their operations. + Adversaries may seek models used by the victim organization or models that are representative of those used by the victim organization. + Representative models may include model architectures, or pre-trained models which define the architecture as well as model parameters from training on a dataset. + The adversary may search public sources for common model architecture configuration file formats such as yaml or python configuration files, and common model storage file formats such as ONNX (.onnx), HDF5 (.h5), Pickle (.pkl), PyTorch (.pth), or TensorFlow (.pb, .tflite). + + Acquired models are useful in advancing the adversary's operations and are frequently used to tailor attacks to the victim model. + subtechnique-of: "{{acquire_ml_artifacts.id}}" + +- &obtain_cap + id: AML.T0016 + name: Obtain Capabilities + object-type: technique + description: + Adversaries may search for and obtain software capabilities for use in their operations. + + Capabilities may be specific to ML-based attacks [{{obtain_advml.name}}](/techniques/{{obtain_advml.id}}) or generic software tools repurposed for malicious intent ([{{obtain_tool.name}}](/techniques/{{obtain_tool.id}})). + In both instances, an adversary may modify or customize the capability to aid in targeting a particular ML system. 
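Since the Models subtechnique above calls out concrete storage formats (.onnx, .h5, .pkl, .pth, .pb, .tflite), a short sketch may help show how quickly such artifacts can be flagged once an adversary, or a defender auditing exposure, has a local copy of a repository or an unpacked application. The directory name and extension list below are illustrative assumptions only.

    # Minimal sketch: flag files whose extension suggests a serialized ML model.
    # The extension set mirrors the formats listed above; real tooling would also
    # inspect file contents rather than trust extensions.
    from pathlib import Path

    MODEL_EXTENSIONS = {".onnx", ".h5", ".pkl", ".pth", ".pb", ".tflite"}

    def find_model_artifacts(root: str) -> list[Path]:
        """Return files under `root` that look like serialized ML models."""
        return [
            path
            for path in Path(root).rglob("*")
            if path.is_file() and path.suffix.lower() in MODEL_EXTENSIONS
        ]

    if __name__ == "__main__":
        # e.g. a cloned repository or an application unpacked with a tool such as Apktool
        for artifact in find_model_artifacts("unpacked_app"):
            print(artifact)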
+ tactics: + - "{{resource_development.id}}" + +- &obtain_advml + id: AML.T0016.000 + name: Adversarial ML Attack Implementations + object-type: technique + description: + Adversaries may search for existing open source implementations of machine learning attacks. + The research community often publishes their code for reproducibility and to further future research. + Libraries intended for research purposes, such as CleverHans, the Adversarial Robustness Toolbox, and FoolBox, can be weaponized by an adversary. + Adversaries may also obtain and use tools that were not originally designed for adversarial ML attacks as part of their attack. + subtechnique-of: "{{obtain_cap.id}}" + +- &obtain_tool + id: AML.T0016.001 + name: Software Tools + object-type: technique + description: > + Adversaries may search for and obtain software tools to support their operations. + Software designed for legitimate use may be repurposed by an adversary for malicious intent. + An adversary may modify or customize software tools to achieve their purpose. + Software tools used to support attacks on ML systems are not necessarily ML-based themselves. + subtechnique-of: "{{obtain_cap.id}}" + +- &develop_advml + id: AML.T0017 + name: "Develop Adversarial ML Attack Capabilities" + object-type: technique + description: + Adversaries may develop their own adversarial attacks. + They may leverage existing libraries as a starting point ([{{obtain_advml.name}}](/techniques/{{obtain_advml.id}})). + They may implement ideas described in public research papers or develop custom made attacks for the victim model. + tactics: + - "{{resource_development.id}}" + +- &acquire_infra + id: AML.T0008 + name: Acquire Infrastructure + object-type: technique + description: | + Adversaries may buy, lease, or rent infrastructure for use throughout their operation. + A wide variety of infrastructure exists for hosting and orchestrating adversary operations. + Infrastructure solutions include physical or cloud servers, domains, mobile devices, and third-party web services. + Free resources may also be used, but they are typically limited. + + Use of these infrastructure solutions allows an adversary to stage, launch, and execute an operation. + Solutions may help adversary operations blend in with traffic that is seen as normal, such as contact to third-party web services. + Depending on the implementation, adversaries may use infrastructure that makes it difficult to physically tie back to them as well as utilize infrastructure that can be rapidly provisioned, modified, and shut down. + tactics: + - "{{resource_development.id}}" + +- &acquire_workspaces + id: AML.T0008.000 + name: ML Development Workspaces + object-type: technique + description: | + Developing and staging machine learning attacks often requires expensive compute resources. + Adversaries may need access to one or many GPUs in order to develop an attack. + They may try to anonymously use free resources such as Google Colaboratory, or cloud resources such as AWS, Azure, or Google Cloud as an efficient way to stand up temporary resources to conduct operations. + Multiple workspaces may be used to avoid detection. + subtechnique-of: "{{acquire_infra.id}}" + +- &acquire_hw + id: AML.T0008.001 + name: Consumer Hardware + object-type: technique + description: | + Adversaries may acquire consumer hardware to conduct their attacks. + Owning the hardware provides the adversary with complete control of the environment. These devices can be hard to trace. 
+ subtechnique-of: "{{acquire_infra.id}}" + +- &publish_poisoned_data + id: AML.T0019 + name: Publish Poisoned Datasets + object-type: technique + description: | + Adversaries may [{{poison_data.name}}](/techniques/{{poison_data.id}}) and publish it to a public location. + The poisoned dataset may be a novel dataset or a poisoned variant of an existing open source dataset. + This data may be introduced to a victim system via [{{supply_chain.name}}](/techniques/{{supply_chain.id}}). + tactics: + - "{{resource_development.id}}" + +- &supply_chain + id: AML.T0010 + name: ML Supply Chain Compromise + object-type: technique + description: | + Adversaries may gain initial access to a system by compromising the unique portions of the ML supply chain. + This could include [{{supply_chain_gpu.name}}](/techniques/{{supply_chain_gpu.id}}), [{{supply_chain_data.name}}](/techniques/{{supply_chain_data.id}}) and its annotations, parts of the ML [{{supply_chain_software.name}}](/techniques/{{supply_chain_software.id}}) stack, or the [{{supply_chain_model.name}}](/techniques/{{supply_chain_model.id}}) itself. + In some instances the attacker will need secondary access to fully carry out an attack using compromised components of the supply chain. + tactics: + - "{{initial_access.id}}" + +- &supply_chain_gpu + id: AML.T0010.000 + name: GPU Hardware + object-type: technique + description: | + Most machine learning systems require access to certain specialized hardware, typically GPUs. + Adversaries can target machine learning systems by specifically targeting the GPU supply chain. + subtechnique-of: "{{supply_chain.id}}" + +- &supply_chain_software + id: AML.T0010.001 + name: ML Software + object-type: technique + description: | + Most machine learning systems rely on a limited set of machine learning frameworks. + An adversary could get access to a large number of machine learning systems through a comprise of one of their supply chains. + Many machine learning projects also rely on other open source implementations of various algorithms. + These can also be compromised in a targeted way to get access to specific systems. + subtechnique-of: "{{supply_chain.id}}" + +- &supply_chain_data + id: AML.T0010.002 + name: Data + object-type: technique + description: | + Data is a key vector of supply chain compromise for adversaries. + Every machine learning project will require some form of data. + Many rely on large open source datasets that are publicly available. + An adversary could rely on compromising these sources of data. + The malicious data could be a result of [{{poison_data.name}}](/techniques/{{poison_data.id}}) or include traditional malware. + + An adversary can also target private datasets in the labeling phase. + The creation of private datasets will often require the hiring of outside labeling services. + An adversary can poison a dataset by modifying the labels being generated by the labeling service. + subtechnique-of: "{{supply_chain.id}}" + +- &supply_chain_model + id: AML.T0010.003 + name: Model + object-type: technique + description: | + Machine learning systems often rely on open sourced models in various ways. + Most commonly, the victim organization may be using these models for fine tuning. + These models will be downloaded from an external source and then used as the base for the model as it is tuned on a smaller, private dataset. + Loading models often requires executing some saved code in the form of a saved model file. 
+ These can be compromised with traditional malware, or through some adversarial machine learning techniques. + subtechnique-of: "{{supply_chain.id}}" + +- &inference_api + id: AML.T0040 + name: ML Model Inference API Access + object-type: technique + description: | + Adversaries may gain access to a model via legitimate access to the inference API. + Inference API access can be a source of information to the adversary ([{{discover_model_ontology.name}}](/techniques/{{discover_model_ontology.id}}), [{{discover_model_family.name}}](/techniques/{{discover_model_family.id}})), a means of staging the attack ([{{verify_attack.name}}](/techniques/{{verify_attack.id}}), [{{craft_adv.name}}](/techniques/{{craft_adv.id}})), or for introducing data to the target system for Impact ([{{evade_model.name}}](/techniques/{{evade_model.id}}), [{{erode_integrity.name}}](/techniques/{{erode_integrity.id}})). + tactics: + - "{{ml_model_access.id}}" + +- &ml_service + id: AML.T0047 + name: ML-Enabled Product or Service + object-type: technique + description: | + Adversaries may use a product or service that uses machine learning under the hood to gain access to the underlying machine learning model. + This type of indirect model access may reveal details of the ML model or its inferences in logs or metadata. + tactics: + - "{{ml_model_access.id}}" + +- &physical_env + id: AML.T0041 + name: Physical Environment Access + object-type: technique + description: | + In addition to the attacks that take place purely in the digital domain, adversaries may also exploit the physical environment for their attacks. + If the model is interacting with data collected from the real world in some way, the adversary can influence the model through access to wherever the data is being collected. + By modifying the data in the collection process, the adversary can perform modified versions of attacks designed for digital access. + tactics: + - "{{ml_model_access.id}}" + +- &full_access + id: AML.T0044 + name: Full ML Model Access + object-type: technique + description: | + Adversaries may gain full "white-box" access to a machine learning model. + This means the adversary has complete knowledge of the model architecture, its parameters, and class ontology. + They may exfiltrate the model to [{{craft_adv.name}}](/techniques/{{craft_adv.id}}) and [{{verify_attack.name}}](/techniques/{{verify_attack.id}}) in an offline setting where it is hard to detect their behavior. + tactics: + - "{{ml_model_access.id}}" + +- &discover_model_ontology + id: AML.T0013 + name: Discover ML Model Ontology + object-type: technique + description: | + Adversaries may discover the ontology of a machine learning model's output space, for example, the types of objects a model can detect. + The adversary may discover the ontology by repeated queries to the model, forcing it to enumerate its output space. + Or the ontology may be discovered in a configuration file or in documentation about the model. + + The model ontology helps the adversary understand how the model is being used by the victim. + It is useful to the adversary in creating targeted attacks. + tactics: + - "{{discovery.id}}" + +- &discover_model_family + id: AML.T0014 + name: Discover ML Model Family + object-type: technique + description: | + Adversaries may discover the general family of the model. + General information about the model may be revealed in documentation, or the adversary may use carefully constructed examples and analyze the model's responses to categorize it.
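To make the query-driven discovery described above more concrete, the sketch below enumerates the labels a model exposes through its inference API, which supports Discover ML Model Ontology and, indirectly, Discover ML Model Family. The endpoint URL, request format, and response schema are hypothetical placeholders, not a real service.

    # Minimal sketch: probe an inference API with varied samples and collect the set
    # of labels it returns. Endpoint, payload, and response fields are hypothetical.
    import json
    from urllib import request

    API_URL = "https://ml-service.example.com/v1/classify"  # hypothetical endpoint

    def query_labels(sample: bytes) -> list[str]:
        """Send one sample to the API and return the predicted label names."""
        req = request.Request(
            API_URL,
            data=sample,
            headers={"Content-Type": "application/octet-stream"},
        )
        with request.urlopen(req) as response:
            body = json.load(response)
        return [prediction["label"] for prediction in body.get("predictions", [])]

    def discover_ontology(probe_samples: list[bytes]) -> set[str]:
        """Accumulate the distinct labels observed across many probe queries."""
        labels: set[str] = set()
        for sample in probe_samples:
            labels.update(query_labels(sample))
        return labels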
+ + Knowledge of the model family can help the adversary identify means of attacking the model and help tailor the attack. + tactics: + - "{{discovery.id}}" + +- &poison_data + id: AML.T0020 + name: Poison Training Data + object-type: technique + description: | + Adversaries may attempt to poison datasets used by a ML model by modifying the underlying data or its labels. + This allows the adversary to embed vulnerabilities in ML models trained on the data that may not be easily detectable. + Data poisoning attacks may or may not require modifying the labels. + The embedded vulnerability is activated at a later time by data samples with an [{{craft_adv_trigger.name}}](/techniques/{{craft_adv_trigger.id}}) + + Poisoned data can be introduced via [{{supply_chain.name}}](/techniques/{{supply_chain.id}}) or the data may be poisoned after the adversary gains [{{initial_access.name}}](/tactics/{{initial_access.id}}) to the system. + tactics: + - "{{resource_development.id}}" + - "{{persistence.id}}" + +- &establish_accounts + id: AML.T0021 + name: Establish Accounts + object-type: technique + description: | + Adversaries may create accounts with various services for use in targeting, to gain access to resources needed in [{{ml_attack_staging.name}}](/tactics/{{ml_attack_staging.id}}), or for victim impersonation. + tactics: + - "{{resource_development.id}}" + +- &train_proxy_model + id: AML.T0005 + name: Create Proxy ML Model + object-type: technique + description: | + Adversaries may obtain models to serve as proxies for the target model in use at the victim organization. + Proxy models are used to simulate complete access to the target model in a fully offline manner. + + Adversaries may train models from representative datasets, attempt to replicate models from victim inference APIs, or use available pre-trained models. + tactics: + - "{{ml_attack_staging.id}}" + +- &proxy_via_artifacts + id: AML.T0005.000 + name: Train Proxy via Gathered ML Artifacts + object-type: technique + description: | + Proxy models may be trained from ML artifacts (such as data, model architectures, and pre-trained models) that are representative of the target model gathered by the adversary. + This can be used to develop attacks that require higher levels of access than the adversary has available or as a means to validate pre-existing attacks without interacting with the target model. + subtechnique-of: "{{train_proxy_model.id}}" + +- &replicate_model + id: AML.T0005.001 + name: Train Proxy via Replication + object-type: technique + description: | + Adversaries may replicate a private model. + By repeatedly querying the victim's [{{inference_api.name}}](/techniques/{{inference_api.id}}), the adversary can collect the target model's inferences into a dataset. + The inferences are used as labels for training a separate model offline that will mimic the behavior and performance of the target model. + + A replicated model that closely mimic's the target model is a valuable resource in staging the attack. + The adversary can use the replicated model to [{{craft_adv.name}}](/techniques/{{craft_adv.id}}) for various purposes (e.g. [{{evade_model.name}}](/techniques/{{evade_model.id}}), [{{chaff_data.name}}](/techniques/{{chaff_data.id}})). + subtechnique-of: "{{train_proxy_model.id}}" + +- &pretrained_proxy + id: AML.T0005.002 + name: Use Pre-Trained Model + object-type: technique + description: | + Adversaries may use an off-the-shelf pre-trained model as a proxy for the victim model to aid in staging the attack. 
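As a concrete illustration of the replication workflow described above, the sketch below labels locally gathered data with a target model's predictions and fits an offline surrogate on those labels. The `query_target` function is a stand-in for whatever inference access the adversary actually has, and the random-forest surrogate is an arbitrary illustrative choice.

    # Minimal sketch of training a proxy model via replication: query the target,
    # use its predictions as labels, and fit a surrogate offline.
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier

    def query_target(batch: np.ndarray) -> np.ndarray:
        """Placeholder for the victim's inference API; returns predicted labels."""
        # Demo stand-in: pretend the target model thresholds the first feature.
        return (batch[:, 0] > 0.5).astype(int)

    def train_proxy(representative_data: np.ndarray) -> RandomForestClassifier:
        """Fit a surrogate on (input, target-model-prediction) pairs."""
        labels = query_target(representative_data)
        proxy = RandomForestClassifier(n_estimators=100, random_state=0)
        proxy.fit(representative_data, labels)
        return proxy

    if __name__ == "__main__":
        data = np.random.rand(1000, 8)  # stand-in for gathered representative data
        proxy = train_proxy(data)
        agreement = (proxy.predict(data) == query_target(data)).mean()
        print(f"agreement with the target on the query set: {agreement:.2%}")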
+ subtechnique-of: "{{train_proxy_model.id}}" + +- &discover_ml_artifacts + id: AML.T0007 + name: Discover ML Artifacts + object-type: technique + description: | + Adversaries may search private sources to identify machine learning artifacts that exist on the system and gather information about them. + These artifacts can include the software stack used to train and deploy models, training and testing data management systems, container registries, software repositories, and model zoos. + + This information can be used to identify targets for further collection, exfiltration, or disruption, and to tailor and improve attacks. + tactics: + - "{{discovery.id}}" + +- &user_execution + id: AML.T0011 + name: User Execution + object-type: technique + description: | + An adversary may rely upon specific actions by a user in order to gain execution. + Users may inadvertently execute unsafe code introduced via [{{supply_chain.name}}](/techniques/{{supply_chain.id}}). + Users may be subjected to social engineering to get them to execute malicious code by, for example, opening a malicious document file or link. + tactics: + - "{{execution.id}}" + +- &unsafe_ml_artifacts + id: AML.T0011.000 + name: Unsafe ML Artifacts + object-type: technique + description: | + Adversaries may develop unsafe ML artifacts that when executed have a deleterious effect. + The adversary can use this technique to establish persistent access to systems. + These models may be introduced via a [{{supply_chain.name}}](/techniques/{{supply_chain.id}}). + + Serialization of models is a popular technique for model storage, transfer, and loading. + However, this format without proper checking presents an opportunity for code execution. + subtechnique-of: "{{user_execution.id}}" + +- &valid_accounts + id: AML.T0012 + name: Valid Accounts + object-type: technique + description: | + Adversaries may obtain and abuse credentials of existing accounts as a means of gaining Initial Access. + Credentials may take the form of usernames and passwords of individual user accounts or API keys that provide access to various ML resources and services. + + Compromised credentials may provide access to additional ML artifacts and allow the adversary to perform [{{discover_ml_artifacts.name}}](/techniques/{{discover_ml_artifacts.id}}). + Compromised credentials may also grant and adversary increased privileges such as write access to ML artifacts used during development or production. + tactics: + - "{{initial_access.id}}" + +- &evade_model + id: AML.T0015 + name: Evade ML Model + object-type: technique + description: | + Adversaries can [{{craft_adv.name}}](/techniques/{{craft_adv.id}}) that prevent a machine learning model from correctly identifying the contents of the data. + This technique can be used to evade a downstream task where machine learning is utilized. + The adversary may evade machine learning based virus/malware detection, or network scanning towards the goal of a traditional cyber attack. + tactics: + - "{{defense_evasion.id}}" + - "{{impact.id}}" + +- &backdoor_model + id: AML.T0018 + name: Backdoor ML Model + object-type: technique + description: | + Adversaries may introduce a backdoor into a ML model. + A backdoored model operates performs as expected under typical conditions, but will produce the adversary's desired output when a trigger is introduced to the input data. + A backdoored model provides the adversary with a persistent artifact on the victim system. 
+ The embedded vulnerability is typically activated at a later time by data samples with an [{{craft_adv_trigger.name}}](/techniques/{{craft_adv_trigger.id}}). + tactics: + - "{{persistence.id}}" + - "{{ml_attack_staging.id}}" + +- &poison_model + id: AML.T0018.000 + name: Poison ML Model + object-type: technique + description: | + Adversaries may introduce a backdoor by training the model on poisoned data, or by interfering with its training process. + The model learns to associate an adversary-defined trigger with the adversary's desired output. + subtechnique-of: "{{backdoor_model.id}}" + +- &inject_payload + id: AML.T0018.001 + name: Inject Payload + object-type: technique + description: | + Adversaries may introduce a backdoor into a model by injecting a payload into the model file. + The payload detects the presence of the trigger and bypasses the model, instead producing the adversary's desired output. + subtechnique-of: "{{backdoor_model.id}}" + +- &exfiltrate_via_api + id: AML.T0024 + name: Exfiltration via ML Inference API + object-type: technique + description: | + Adversaries may exfiltrate private information via [{{inference_api.name}}](/techniques/{{inference_api.id}}). + ML models have been shown to leak private information about their training data (e.g. [{{membership_inference.name}}](/techniques/{{membership_inference.id}}), [{{model_inversion.name}}](/techniques/{{model_inversion.id}})). + The model itself may also be extracted ([{{extract_model.name}}](/techniques/{{extract_model.id}})) for the purposes of [{{ip_theft.name}}](/techniques/{{ip_theft.id}}). + + Exfiltration of information relating to private training data raises privacy concerns. + Private training data may include personally identifiable information, or other protected data. + tactics: + - "{{exfiltration.id}}" + +- &membership_inference + id: AML.T0024.000 + name: Infer Training Data Membership + object-type: technique + description: | + Adversaries may infer the membership of a data sample in its training set, which raises privacy concerns. + Some strategies make use of a shadow model that could be obtained via [{{replicate_model.name}}](/techniques/{{replicate_model.id}}), while others use statistics of model prediction scores. + + This can cause the victim model to leak private information, such as PII of those in the training set or other forms of protected IP. + subtechnique-of: "{{exfiltrate_via_api.id}}" + +- &model_inversion + id: AML.T0024.001 + name: Invert ML Model + object-type: technique + description: | + Machine learning models' training data could be reconstructed by exploiting the confidence scores that are available via an inference API. + By querying the inference API strategically, adversaries can back out potentially private information embedded within the training data. + This could lead to privacy violations if the attacker can reconstruct the data of sensitive features used in the algorithm. + subtechnique-of: "{{exfiltrate_via_api.id}}" + +- &extract_model + id: AML.T0024.002 + name: Extract ML Model + object-type: technique + description: | + Adversaries may extract a functional copy of a private model. + By repeatedly querying the victim's [{{inference_api.name}}](/techniques/{{inference_api.id}}), the adversary can collect the target model's inferences into a dataset. + The inferences are used as labels for training a separate model offline that will mimic the behavior and performance of the target model.
+ + Adversaries may extract the model to avoid paying per query in a machine learning as a service setting. + Model extraction is used for [{{ip_theft.name}}](/techniques/{{ip_theft.id}}). + subtechnique-of: "{{exfiltrate_via_api.id}}" + +- &exfiltrate_via_cyber + id: AML.T0025 + name: Exfiltration via Cyber Means + object-type: technique + description: | + Adversaries may exfiltrate ML artifacts or other information relevant to their goals via traditional cyber means. + + See the ATT&CK [Exfiltration](https://attack.mitre.org/tactics/TA0010/) tactic for more information. + tactics: + - "{{exfiltration.id}}" + +- &ml_dos + id: AML.T0029 + name: Denial of ML Service + object-type: technique + description: | + Adversaries may target machine learning systems with a flood of requests for the purpose of degrading or shutting down the service. + Since many machine learning systems require significant amounts of specialized compute, they are often expensive bottlenecks that can become overloaded. + Adversaries can intentionally craft inputs that require heavy amounts of useless compute from the machine learning system. + tactics: + - "{{impact.id}}" + +- &chaff_data + id: AML.T0046 + name: Spamming ML System with Chaff Data + object-type: technique + description: | + Adversaries may spam the machine learning system with chaff data that causes increase in the number of detections. + This can cause analysts at the victim organization to waste time reviewing and correcting incorrect inferences. + tactics: + - "{{impact.id}}" + +- &erode_integrity + id: AML.T0031 + name: Erode ML Model Integrity + object-type: technique + description: | + Adversaries may degrade the target model's performance with adversarial data inputs to erode confidence in the system over time. + This can lead to the victim organization wasting time and money both attempting to fix the system and performing the tasks it was meant to automate by hand. + tactics: + - "{{impact.id}}" + +- &cost_harvesting + id: AML.T0034 + name: Cost Harvesting + object-type: technique + description: | + Adversaries may target different machine learning services to send useless queries or computationally expensive inputs to increase the cost of running services at the victim organization. + Sponge examples are a particular type of adversarial data designed to maximize energy consumption and thus operating cost. + tactics: + - "{{impact.id}}" + +- &ml_artifact_collection + id: AML.T0035 + name: ML Artifact Collection + object-type: technique + description: | + Adversaries may collect ML artifacts for [{{exfiltration.name}}](/tactics/{{exfiltration.id}}) or for use in [{{ml_attack_staging.name}}](/tactics/{{ml_attack_staging.id}}). + ML artifacts include models and datasets as well as other telemetry data produced when interacting with a model. + tactics: + - "{{collection.id}}" + +- &info_repos + id: AML.T0036 + name: Data from Information Repositories + object-type: technique + description: | + Adversaries may leverage information repositories to mine valuable information. + Information repositories are tools that allow for storage of information, typically to facilitate collaboration or information sharing between users, and can store a wide variety of data that may aid adversaries in further objectives, or direct access to the target information. + + Information stored in a repository may vary based on the specific instance or environment. 
+ Specific common information repositories include Sharepoint, Confluence, and enterprise databases such as SQL Server. + tactics: + - "{{collection.id}}" + +- &verify_attack + id: AML.T0042 + name: Verify Attack + object-type: technique + description: | + Adversaries can verify the efficacy of their attack via an inference API or access to an offline copy of the target model. + This gives the adversary confidence that their approach works and allows them to carry out the attack at a later time of their choosing. + The adversary may verify the attack once but use it against many edge devices running copies of the target model. + The adversary may verify their attack digitally, then deploy it in the [{{physical_env.name}}](/techniques/{{physical_env.id}}) at a later time. + Verifying the attack may be hard to detect since the adversary can use a minimal number of queries or an offline copy of the model. + tactics: + - "{{ml_attack_staging.id}}" + +- &craft_adv + id: AML.T0043 + name: Craft Adversarial Data + object-type: technique + description: | + Adversarial data are inputs to a machine learning model that have been modified such that they cause the adversary's desired effect in the target model. + Effects can range from misclassification, to missed detections, to maximising energy consumption. + Typically, the modification is constrained in magnitude or location so that a human still perceives the data as if it were unmodified, but human perceptibility may not always be a concern depending on the adversary's intended effect. + For example, an adversarial input for an image classification task is an image the machine learning model would misclassify, but a human would still recognize as containing the correct class. + + Depending on the adversary's knowledge of and access to the target model, the adversary may use different classes of algorithms to develop the adversarial example, such as [{{craft_adv_whitebox.name}}](/techniques/{{craft_adv_whitebox.id}}), [{{craft_adv_blackbox.name}}](/techniques/{{craft_adv_blackbox.id}}), [{{craft_adv_transfer.name}}](/techniques/{{craft_adv_transfer.id}}), or [{{craft_adv_manual.name}}](/techniques/{{craft_adv_manual.id}}). + + The adversary may [{{verify_attack.name}}](/techniques/{{verify_attack.id}}) to confirm that their approach works if they have white-box or inference API access to the model. + This allows the adversary to gain confidence that their attack is effective in a "live" environment where their attack may be noticed. + They can then use the attack at a later time to accomplish their goals. + An adversary may optimize adversarial examples for [{{evade_model.name}}](/techniques/{{evade_model.id}}), or to [{{erode_integrity.name}}](/techniques/{{erode_integrity.id}}). + tactics: + - "{{ml_attack_staging.id}}" + +- &craft_adv_whitebox + id: AML.T0043.000 + name: White-Box Optimization + object-type: technique + description: | + In White-Box Optimization, the adversary has full access to the target model and optimizes the adversarial example directly. + Adversarial examples trained in this manner are most effective against the target model. + subtechnique-of: "{{craft_adv.id}}" + +- &craft_adv_blackbox + id: AML.T0043.001 + name: Black-Box Optimization + object-type: technique + description: | + In Black-Box attacks, the adversary has black-box (i.e. [{{inference_api.name}}](/techniques/{{inference_api.id}}) via API access) access to the target model. + With black-box attacks, the adversary may be using an API that the victim is monitoring.
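To ground White-Box Optimization in something runnable, here is a toy gradient-sign perturbation against a small logistic-regression "model" written directly in NumPy. It is purely illustrative: the model, labels, and epsilon are made up, and real attacks operate on real target or proxy models with carefully constrained perturbations.

    # Toy sketch of White-Box Optimization: one FGSM-style step against a
    # NumPy logistic-regression model the "adversary" fully controls.
    import numpy as np

    rng = np.random.default_rng(0)
    w = rng.normal(size=8)   # model weights (fully known to the adversary)
    b = 0.1                  # model bias

    def predict(x: np.ndarray) -> float:
        """Sigmoid score of the toy target model."""
        return 1.0 / (1.0 + np.exp(-(x @ w + b)))

    def fgsm_step(x: np.ndarray, true_label: int, eps: float = 0.5) -> np.ndarray:
        """Move the input in the direction that increases the loss for `true_label`."""
        # For binary cross-entropy with a sigmoid output, d(loss)/dx = (p - y) * w.
        gradient = (predict(x) - true_label) * w
        return x + eps * np.sign(gradient)

    x = rng.normal(size=8)
    y = int(predict(x) > 0.5)       # treat the model's own decision as the label
    x_adv = fgsm_step(x, y)
    print("clean score:", round(float(predict(x)), 3),
          "| adversarial score:", round(float(predict(x_adv)), 3))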
+ These attacks are generally less effective and require more inferences than [{{craft_adv_whitebox.name}}](/techniques/{{craft_adv_whitebox.id}}) attacks, but they require much less access. + subtechnique-of: "{{craft_adv.id}}" + +- &craft_adv_transfer + id: AML.T0043.002 + name: Black-Box Transfer + object-type: technique + description: | + In Black-Box Transfer attacks, the adversary uses one or more proxy models (trained via [{{train_proxy_model.name}}](/techniques/{{train_proxy_model.id}}) or [{{replicate_model.name}}](/techniques/{{replicate_model.id}})) that they have full access to and that are representative of the target model. + The adversary uses [{{craft_adv_whitebox.name}}](/techniques/{{craft_adv_whitebox.id}}) on the proxy models to generate adversarial examples. + If the set of proxy models is close enough to the target model, the adversarial example should generalize from one to another. + This means that an attack that works for the proxy models will likely then work for the target model. + If the adversary has [{{inference_api.name}}](/techniques/{{inference_api.id}}), they may use it to [{{verify_attack.name}}](/techniques/{{verify_attack.id}}) and confirm that the attack is working, incorporating that information into their training process. + subtechnique-of: "{{craft_adv.id}}" + +- &craft_adv_manual + id: AML.T0043.003 + name: Manual Modification + object-type: technique + description: | + Adversaries may manually modify the input data to craft adversarial data. + They may use their knowledge of the target model to modify parts of the data they suspect help the model in performing its task. + The adversary may use trial and error until they are able to verify they have a working adversarial input. + subtechnique-of: "{{craft_adv.id}}" + +- &craft_adv_trigger + id: AML.T0043.004 + name: Insert Backdoor Trigger + object-type: technique + description: | + The adversary may add a perceptual trigger into inference data. + The trigger may be imperceptible or non-obvious to humans. + This technique is used in conjunction with [{{poison_model.name}}](/techniques/{{poison_model.id}}) and allows the adversary to produce their desired effect in the target model. + subtechnique-of: "{{craft_adv.id}}" + +- &ip_theft + id: AML.T0045 + name: ML Intellectual Property Theft + object-type: technique + description: | + Adversaries may exfiltrate ML artifacts to steal intellectual property and cause economic harm to the victim organization. + + Proprietary training data is costly to collect and annotate and may be a target for [{{exfiltration.name}}](/tactics/{{exfiltration.id}}) and theft. + + MLaaS providers charge for use of their API. + An adversary who has stolen a model via [{{exfiltration.name}}](/tactics/{{exfiltration.id}}) or via [{{extract_model.name}}](/techniques/{{extract_model.id}}) now has unlimited use of that service without paying the owner of the intellectual property. + tactics: + - "{{impact.id}}" diff --git a/dist/ATLAS.yaml b/dist/ATLAS.yaml new file mode 100644 index 0000000..8384592 --- /dev/null +++ b/dist/ATLAS.yaml @@ -0,0 +1,2229 @@ +--- +id: ATLAS +name: ATLAS Machine Learning Threat Matrix +version: 3.0.0 +tactics: +- id: AML.TA0002 + name: Reconnaissance + object-type: tactic + description: 'The adversary is trying to gather information they can use to plan + + future operations. + + + Reconnaissance consists of techniques that involve adversaries actively or passively + gathering information that can be used to support targeting.
+ + Such information may include details of the victim organizations machine learning + capabilities and research efforts. + + This information can be leveraged by the adversary to aid in other phases of the + adversary lifecycle, such as using gathered information to obtain relevant ML + artifacts, targeting ML capabilities used by the victim, tailoring attacks to + the particular models used by the victim, or to drive and lead further Reconnaissance + efforts. + + ' +- id: AML.TA0003 + name: Resource Development + object-type: tactic + description: 'The adversary is trying to establish resources they can use to support + operations. + + + Resource Development consists of techniques that involve adversaries creating, + + purchasing, or compromising/stealing resources that can be used to support targeting. + + Such resources include machine learning artifacts, infrastructure, accounts, or + capabilities. + + These resources can be leveraged by the adversary to aid in other phases of the + adversary lifecycle, such as ML Attack Staging. + + ' +- id: AML.TA0004 + name: Initial Access + object-type: tactic + description: 'The adversary is trying to gain access to the system containing machine + learning artifacts. + + + The target system could be a network, mobile device, or an edge device such as + a sensor platform. + + The machine learning capabilities used by the system could be local with onboard + or cloud enabled ML capabilities. + + + Initial Access consists of techniques that use various entry vectors to gain their + initial foothold within the system. + + ' +- id: AML.TA0000 + name: ML Model Access + object-type: tactic + description: 'An adversary is attempting to gain some level of access to a machine + learning model. + + + ML Model Access consists of techniques that use various types of access to the + machine learning model that can be used by the adversary to gain information, + develop attacks, and as a means to input data to the model. + + The level of access can range from the full knowledge of the internals of the + model to access to the physical environment where data is collected for use in + the machine learning model. + + The adversary may use varying levels of model access during the course of their + attack, from staging the attack to impacting the target system. + + ' +- id: AML.TA0005 + name: Execution + object-type: tactic + description: 'The adversary is trying to run malicious code. + + + Execution consists of techniques that result in adversary-controlled code running + on a local or remote system. + + Techniques that run malicious code are often paired with techniques from all other + tactics to achieve broader goals, like exploring a network or stealing data. + + For example, an adversary might use a remote access tool to run a PowerShell script + that does Remote System Discovery. + + ' +- id: AML.TA0006 + name: Persistence + object-type: tactic + description: 'The adversary is trying to maintain their foothold. + + + Persistence consists of techniques that adversaries use to keep access to systems + across restarts, changed credentials, and other interruptions that could cut off + their access. + + Techniques used for persistence often involve leaving behind modified ML artifacts + such as poisoned training data or backdoored ML models. + + ' +- id: AML.TA0007 + name: Defense Evasion + object-type: tactic + description: 'The adversary is trying to avoid being detected by security software. 
+ + + Defense Evasion consists of techniques that adversaries use to avoid detection + throughout their compromise. + + Techniques used for defense evasion include evading ML-enabled security software + such as malware detectors. + + ' +- id: AML.TA0008 + name: Discovery + object-type: tactic + description: 'The adversary is trying to figure out your environment. + + + Discovery consists of techniques an adversary may use to gain knowledge about + the system and internal network. + + These techniques help adversaries observe the environment and orient themselves + before deciding how to act. + + They also allow adversaries to explore what they can control and what''s around + their entry point in order to discover how it could benefit their current objective. + + Native operating system tools are often used toward this post-compromise information-gathering + objective. + + ' +- id: AML.TA0009 + name: Collection + object-type: tactic + description: 'The adversary is trying to gather ML artifacts and other related information + relevant to their goal. + + + Collection consists of techniques adversaries may use to gather information and + the sources information is collected from that are relevant to following through + on the adversary''s objectives. + + Frequently, the next goal after collecting data is to steal (exfiltrate) the ML + artifacts, or use the collected information to stage future operations. + + Common target sources include software repositories, container registries, model + repositories, and object stores. + + ' +- id: AML.TA0001 + name: ML Attack Staging + object-type: tactic + description: 'An adversary is leveraging their knowledge of and access to the target + system to tailor the attack. + + + ML Attack Staging consists of techniques adversaries use to prepare their attack + on the target ML model. + + Techniques can include training proxy models, poisoning the target model, and + crafting adversarial data to feed the target model. + + Some of these techniques can be performed in an offline manner and are thus difficult + to mitigate. + + These techniques are often used to achieve the adversary''s end goal. + + ' +- id: AML.TA0010 + name: Exfiltration + object-type: tactic + description: 'The adversary is trying to steal machine learning artifacts. + + + Exfiltration consists of techniques that adversaries may use to steal data from + your network. + + Data may be stolen for its valuable intellectual property, or for use in staging + future operations. + + + Techniques for getting data out of a target network typically include transferring + it over their command and control channel or an alternate channel and may also + include putting size limits on the transmission. + + ' +- id: AML.TA0011 + name: Impact + object-type: tactic + description: 'The adversary is trying to manipulate, interrupt, erode confidence + in, or destroy your systems and data. + + + Impact consists of techniques that adversaries use to disrupt availability or + compromise integrity by manipulating business and operational processes. + + Techniques used for impact can include destroying or tampering with data. + + In some cases, business processes can look fine, but may have been altered to + benefit the adversaries'' goals. + + These techniques might be used by adversaries to follow through on their end goal + or to provide cover for a confidentiality breach.
+ + ' +techniques: +- id: AML.T0000 + name: Search for Victim's Publicly Available Research Materials + object-type: technique + description: 'Adversaries may search publicly available research to learn how and + where machine learning is used within a victim organization. + + The adversary can use this information to identify targets for attack, or to tailor + an existing attack to make it more effective. + + Organizations often use open source model architectures trained on additional + proprietary data in production. + + Knowledge of this underlying architecture allows the adversary to craft more realistic + proxy models ([Create Proxy ML Model](/techniques/AML.T0005)). + + An adversary can search these resources for publications for authors employed + at the victim organization. + + + Research materials may exist as academic papers published in [Journals and Conference + Proceedings](/techniques/AML.T0000.000), or stored in [Pre-Print Repositories](/techniques/AML.T0000.001), + as well as [Technical Blogs](/techniques/AML.T0000.002). + + ' + tactics: + - AML.TA0002 +- id: AML.T0000.000 + name: Journals and Conference Proceedings + object-type: technique + description: 'Many of the publications accepted at premier machine learning conferences + and journals come from commercial labs. + + Some journals and conferences are open access, others may require paying for access + or a membership. + + These publications will often describe in detail all aspects of a particular approach + for reproducibility. + + This information can be used by adversaries to implement the paper. + + ' + subtechnique-of: AML.T0000 +- id: AML.T0000.001 + name: Pre-Print Repositories + object-type: technique + description: 'Pre-Print repositories, such as arXiv, contain the latest academic + research papers that haven''t been peer reviewed. + + They may contain research notes, or technical reports that aren''t typically published + in journals or conference proceedings. + + Pre-print repositories also serve as a central location to share papers that have + been accepted to journals. + + Searching pre-print repositories provide adversaries with a relatively up-to-date + view of what researchers in the victim organization are working on. + + ' + subtechnique-of: AML.T0000 +- id: AML.T0000.002 + name: Technical Blogs + object-type: technique + description: 'Research labs at academic institutions and Company R&D divisions often + have blogs that highlight their use of machine learning and its application to + the organizations unique problems. + + Individual researchers also frequently document their work in blogposts. + + An adversary may search for posts made by the target victim organization or its + employees. + + In comparison to [Journals and Conference Proceedings](/techniques/AML.T0000.000) + and [Pre-Print Repositories](/techniques/AML.T0000.001) this material will often + contain more practical aspects of the machine learning system. + + This could include underlying technologies and frameworks used, and possibly some + information about the API access and use case. + + This will help the adversary better understand how that organization is using + machine learning internally and the details of their approach that could aid in + tailoring an attack. 
+ + ' + subtechnique-of: AML.T0000 +- id: AML.T0001 + name: Search for Publicly Available Adversarial Vulnerability Analysis + object-type: technique + description: 'Much like the [Search for Victim''s Publicly Available Research Materials](/techniques/AML.T0000), + there is often ample research available on the vulnerabilities of common models. + Once a target has been identified, an adversary will likely try to identify any + pre-existing work that has been done for this class of models. + + This will include not only reading academic papers that may identify the particulars + of a successful attack, but also identifying pre-existing implementations of those + attacks. The adversary may obtain [Adversarial ML Attack Implementations](/techniques/AML.T0016.000) + or [Develop Adversarial ML Attack Capabilities](/techniques/AML.T0017) of their own + if necessary.' + tactics: + - AML.TA0002 +- id: AML.T0003 + name: Search Victim-Owned Websites + object-type: technique + description: 'Adversaries may search websites owned by the victim for information + that can be used during targeting. + + Victim-owned websites may contain technical details about their ML-enabled products + or services. + + Victim-owned websites may contain a variety of details, including names of departments/divisions, + physical locations, and data about key employees such as names, roles, and contact + info. + + These sites may also have details highlighting business operations and relationships. + + + Adversaries may search victim-owned websites to gather actionable information. + + This information may help adversaries tailor their attacks (e.g. [Develop Adversarial + ML Attack Capabilities](/techniques/AML.T0017) or [Manual Modification](/techniques/AML.T0043.003)). + + Information from these sources may reveal opportunities for other forms of reconnaissance + (e.g. [Search for Victim''s Publicly Available Research Materials](/techniques/AML.T0000) + or [Search for Publicly Available Adversarial Vulnerability Analysis](/techniques/AML.T0001)). + + ' + tactics: + - AML.TA0002 +- id: AML.T0004 + name: Search Application Repositories + object-type: technique + description: 'Adversaries may search open application repositories during targeting. + + Examples of these include Google Play, the iOS App store, the macOS App Store, + and the Microsoft Store. + + + Adversaries may craft search queries seeking applications that contain ML-enabled + components. + + Frequently, the next step is to [Acquire Public ML Artifacts](/techniques/AML.T0002). + + ' + tactics: + - AML.TA0002 +- id: AML.T0006 + name: Active Scanning + object-type: technique + description: 'An adversary may probe or scan the victim system to gather information + for targeting. + + This is distinct from other reconnaissance techniques that do not involve direct + interaction with the victim system. + + ' + tactics: + - AML.TA0002 +- id: AML.T0002 + name: Acquire Public ML Artifacts + object-type: technique + description: 'Adversaries may search public sources, including cloud storage, public-facing + services, and software or data repositories, to identify machine learning artifacts. + + These machine learning artifacts may include the software stack used to train + and deploy models, training and testing data, model configurations and parameters. + + An adversary will be particularly interested in artifacts hosted by or associated + with the victim organization as they may represent what that organization uses + in a production environment.
+ + Adversaries may identify artifact repositories via other resources associated + with the victim organization (e.g. [Search Victim-Owned Websites](/techniques/AML.T0003) + or [Search for Victim''s Publicly Available Research Materials](/techniques/AML.T0000)). + + These ML artifacts often provide adversaries with details of the ML task and approach. + + + ML artifacts can aid in an adversary''s ability to [Create Proxy ML Model](/techniques/AML.T0005). + + If these artifacts include pieces of the actual model in production, they can + be used to directly [Craft Adversarial Data](/techniques/AML.T0043). + + Acquiring some artifacts requires registration (providing user details such as email/name), + AWS keys, or written requests, and may require the adversary to [Establish Accounts](/techniques/AML.T0021). + + + Artifacts might be hosted on victim-controlled infrastructure, providing the victim + with some information on who has accessed that data. + + ' + tactics: + - AML.TA0003 +- id: AML.T0002.000 + name: Datasets + object-type: technique + description: 'Adversaries may collect public datasets to use in their operations. + + Datasets used by the victim organization or datasets that are representative of + the data used by the victim organization may be valuable to adversaries. + + Datasets can be stored in cloud storage, or on victim-owned websites. + + Some datasets require the adversary to [Establish Accounts](/techniques/AML.T0021) + for access. + + + Acquired datasets help the adversary advance their operations, stage attacks, and + tailor attacks to the victim organization. + + ' + subtechnique-of: AML.T0002 +- id: AML.T0002.001 + name: Models + object-type: technique + description: 'Adversaries may acquire public models to use in their operations. + + Adversaries may seek models used by the victim organization or models that are + representative of those used by the victim organization. + + Representative models may include model architectures, or pre-trained models which + define the architecture as well as model parameters from training on a dataset. + + The adversary may search public sources for common model architecture configuration + file formats such as yaml or python configuration files, and common model storage + file formats such as ONNX (.onnx), HDF5 (.h5), Pickle (.pkl), PyTorch (.pth), + or TensorFlow (.pb, .tflite). + + + Acquired models are useful in advancing the adversary''s operations and are frequently + used to tailor attacks to the victim model. + + ' + subtechnique-of: AML.T0002 +- id: AML.T0016 + name: Obtain Capabilities + object-type: technique + description: 'Adversaries may search for and obtain software capabilities for use + in their operations. + + Capabilities may be specific to ML-based attacks ([Adversarial ML Attack Implementations](/techniques/AML.T0016.000)) + or generic software tools repurposed for malicious intent ([Software Tools](/techniques/AML.T0016.001)). + In both instances, an adversary may modify or customize the capability to aid + in targeting a particular ML system.' + tactics: + - AML.TA0003 +- id: AML.T0016.000 + name: Adversarial ML Attack Implementations + object-type: technique + description: Adversaries may search for existing open source implementations of + machine learning attacks. The research community often publishes their code for + reproducibility and to further future research.
Libraries intended for research + purposes, such as CleverHans, the Adversarial Robustness Toolbox, and FoolBox, + can be weaponized by an adversary. Adversaries may also obtain and use tools that + were not originally designed for adversarial ML attacks as part of their attack. + subtechnique-of: AML.T0016 +- id: AML.T0016.001 + name: Software Tools + object-type: technique + description: 'Adversaries may search for and obtain software tools to support their + operations. Software designed for legitimate use may be repurposed by an adversary + for malicious intent. An adversary may modify or customize software tools to achieve + their purpose. Software tools used to support attacks on ML systems are not necessarily + ML-based themselves. + + ' + subtechnique-of: AML.T0016 +- id: AML.T0017 + name: Develop Adversarial ML Attack Capabilities + object-type: technique + description: Adversaries may develop their own adversarial attacks. They may leverage + existing libraries as a starting point ([Adversarial ML Attack Implementations](/techniques/AML.T0016.000)). + They may implement ideas described in public research papers or develop custom + made attacks for the victim model. + tactics: + - AML.TA0003 +- id: AML.T0008 + name: Acquire Infrastructure + object-type: technique + description: 'Adversaries may buy, lease, or rent infrastructure for use throughout + their operation. + + A wide variety of infrastructure exists for hosting and orchestrating adversary + operations. + + Infrastructure solutions include physical or cloud servers, domains, mobile devices, + and third-party web services. + + Free resources may also be used, but they are typically limited. + + + Use of these infrastructure solutions allows an adversary to stage, launch, and + execute an operation. + + Solutions may help adversary operations blend in with traffic that is seen as + normal, such as contact to third-party web services. + + Depending on the implementation, adversaries may use infrastructure that makes + it difficult to physically tie back to them as well as utilize infrastructure + that can be rapidly provisioned, modified, and shut down. + + ' + tactics: + - AML.TA0003 +- id: AML.T0008.000 + name: ML Development Workspaces + object-type: technique + description: 'Developing and staging machine learning attacks often requires expensive + compute resources. + + Adversaries may need access to one or many GPUs in order to develop an attack. + + They may try to anonymously use free resources such as Google Colaboratory, or + cloud resources such as AWS, Azure, or Google Cloud as an efficient way to stand + up temporary resources to conduct operations. + + Multiple workspaces may be used to avoid detection. + + ' + subtechnique-of: AML.T0008 +- id: AML.T0008.001 + name: Consumer Hardware + object-type: technique + description: 'Adversaries may acquire consumer hardware to conduct their attacks. + + Owning the hardware provides the adversary with complete control of the environment. + These devices can be hard to trace. + + ' + subtechnique-of: AML.T0008 +- id: AML.T0019 + name: Publish Poisoned Datasets + object-type: technique + description: 'Adversaries may [Poison Training Data](/techniques/AML.T0020) and + publish it to a public location. + + The poisoned dataset may be a novel dataset or a poisoned variant of an existing + open source dataset. + + This data may be introduced to a victim system via [ML Supply Chain Compromise](/techniques/AML.T0010). 
+ + ' + tactics: + - AML.TA0003 +- id: AML.T0010 + name: ML Supply Chain Compromise + object-type: technique + description: 'Adversaries may gain initial access to a system by compromising the + unique portions of the ML supply chain. + + This could include [GPU Hardware](/techniques/AML.T0010.000), [Data](/techniques/AML.T0010.002) + and its annotations, parts of the [ML Software](/techniques/AML.T0010.001) + stack, or the [Model](/techniques/AML.T0010.003) itself. + + In some instances the attacker will need secondary access to fully carry out an + attack using compromised components of the supply chain. + + ' + tactics: + - AML.TA0004 +- id: AML.T0010.000 + name: GPU Hardware + object-type: technique + description: 'Most machine learning systems require access to certain specialized + hardware, typically GPUs. + + Adversaries can target machine learning systems by specifically targeting the + GPU supply chain. + + ' + subtechnique-of: AML.T0010 +- id: AML.T0010.001 + name: ML Software + object-type: technique + description: 'Most machine learning systems rely on a limited set of machine learning + frameworks. + + An adversary could get access to a large number of machine learning systems through + a compromise of one of their supply chains. + + Many machine learning projects also rely on other open source implementations + of various algorithms. + + These can also be compromised in a targeted way to get access to specific systems. + + ' + subtechnique-of: AML.T0010 +- id: AML.T0010.002 + name: Data + object-type: technique + description: 'Data is a key vector of supply chain compromise for adversaries. + + Every machine learning project will require some form of data. + + Many rely on large open source datasets that are publicly available. + + An adversary could rely on compromising these sources of data. + + The malicious data could be a result of [Poison Training Data](/techniques/AML.T0020) + or include traditional malware. + + + An adversary can also target private datasets in the labeling phase. + + The creation of private datasets will often require the hiring of outside labeling + services. + + An adversary can poison a dataset by modifying the labels being generated by the + labeling service. + + ' + subtechnique-of: AML.T0010 +- id: AML.T0010.003 + name: Model + object-type: technique + description: 'Machine learning systems often rely on open sourced models in various + ways. + + Most commonly, the victim organization may be using these models for fine tuning. + + These models will be downloaded from an external source and then used as the base + for the model as it is tuned on a smaller, private dataset. + + Loading models often requires executing some saved code in the form of a saved + model file. + + These can be compromised with traditional malware, or through some adversarial + machine learning techniques. + + ' + subtechnique-of: AML.T0010 +- id: AML.T0040 + name: ML Model Inference API Access + object-type: technique + description: 'Adversaries may gain access to a model via legitimate access to the + inference API. + + Inference API access can be a source of information to the adversary ([Discover + ML Model Ontology](/techniques/AML.T0013), [Discover ML Model Family](/techniques/AML.T0014)), + a means of staging the attack ([Verify Attack](/techniques/AML.T0042), [Craft + Adversarial Data](/techniques/AML.T0043)), or for introducing data to the target + system for Impact ([Evade ML Model](/techniques/AML.T0015), [Erode ML Model Integrity](/techniques/AML.T0031)).
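As a rough illustration of the kind of access ML Model Inference API Access describes, the sketch below queries a hypothetical REST inference endpoint and records the returned scores for later analysis. The endpoint URL, API key, and response format are illustrative assumptions only and are not part of ATLAS or any specific product.

```python
import json

import requests  # third-party HTTP client

# Hypothetical inference endpoint and credentials (placeholders, not real values).
ENDPOINT = "https://ml.example.com/v1/predict"
API_KEY = "REDACTED"


def query_model(sample):
    """Send one input to the inference API and return its raw response."""
    resp = requests.post(
        ENDPOINT,
        headers={"Authorization": f"Bearer {API_KEY}"},
        json={"instances": [sample]},
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json()


# Collect model responses that could later support ontology discovery or attack staging.
collected = [query_model(s) for s in [[0.1, 0.2, 0.3], [0.9, 0.8, 0.7]]]
print(json.dumps(collected, indent=2))
```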
+ + ' + tactics: + - AML.TA0000 +- id: AML.T0047 + name: ML-Enabled Product or Service + object-type: technique + description: 'Adversaries may use a product or service that uses machine learning + under the hood to gain access to the underlying machine learning model. + + This type of indirect model access may reveal details of the ML model or its inferences + in logs or metadata. + + ' + tactics: + - AML.TA0000 +- id: AML.T0041 + name: Physical Environment Access + object-type: technique + description: 'In addition to the attacks that take place purely in the digital domain, + adversaries may also exploit the physical environment for their attacks. + + If the model is interacting with data collected from the real world in some way, + the adversary can influence the model through access to wherever the data is being + collected. + + By modifying the data in the collection process, the adversary can perform modified + versions of attacks designed for digital access. + + ' + tactics: + - AML.TA0000 +- id: AML.T0044 + name: Full ML Model Access + object-type: technique + description: 'Adversaries may gain full "white-box" access to a machine learning + model. + + This means the adversary has complete knowledge of the model architecture, its + parameters, and class ontology. + + They may exfiltrate the model to [Craft Adversarial Data](/techniques/AML.T0043) + and [Verify Attack](/techniques/AML.T0042) in an offline setting where it is hard to detect + their behavior. + + ' + tactics: + - AML.TA0000 +- id: AML.T0013 + name: Discover ML Model Ontology + object-type: technique + description: 'Adversaries may discover the ontology of a machine learning model''s + output space, for example, the types of objects a model can detect. + + The adversary may discover the ontology by repeated queries to the model, forcing + it to enumerate its output space. + + Or the ontology may be discovered in a configuration file or in documentation + about the model. + + + The model ontology helps the adversary understand how the model is being used + by the victim. + + It is useful to the adversary in creating targeted attacks. + + ' + tactics: + - AML.TA0008 +- id: AML.T0014 + name: Discover ML Model Family + object-type: technique + description: 'Adversaries may discover the general family of the model. + + General information about the model may be revealed in documentation, or the adversary + may use carefully constructed examples and analyze the model''s responses to + categorize it. + + + Knowledge of the model family can help the adversary identify means of attacking + the model and help tailor the attack. + + ' + tactics: + - AML.TA0008 +- id: AML.T0020 + name: Poison Training Data + object-type: technique + description: 'Adversaries may attempt to poison datasets used by an ML model by modifying + the underlying data or its labels. + + This allows the adversary to embed vulnerabilities in ML models trained on the + data that may not be easily detectable. + + Data poisoning attacks may or may not require modifying the labels. + + The embedded vulnerability is activated at a later time by data samples with an + [Insert Backdoor Trigger](/techniques/AML.T0043.004). + + + Poisoned data can be introduced via [ML Supply Chain Compromise](/techniques/AML.T0010) + or the data may be poisoned after the adversary gains [Initial Access](/tactics/AML.TA0004) + to the system.
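For intuition, here is a minimal sketch of the two poisoning variants described under Poison Training Data above: flipping labels and stamping a backdoor trigger patch onto samples. The array shapes, trigger placement, and target label are made-up assumptions purely for illustration.

```python
import numpy as np

rng = np.random.default_rng(0)

# Toy training set: 100 grayscale 28x28 "images" with binary labels (illustrative only).
x_train = rng.random((100, 28, 28), dtype=np.float32)
y_train = rng.integers(0, 2, size=100)

# Variant 1: label flipping on a small fraction of samples.
flip_idx = rng.choice(len(y_train), size=10, replace=False)
y_train[flip_idx] = 1 - y_train[flip_idx]

# Variant 2: stamp a small white square (the backdoor trigger) and force the target label.
trigger_idx = rng.choice(len(y_train), size=10, replace=False)
x_train[trigger_idx, 24:27, 24:27] = 1.0  # 3x3 patch in the corner acts as the trigger
y_train[trigger_idx] = 1                  # adversary's desired class

print(f"poisoned {len(flip_idx)} labels and {len(trigger_idx)} trigger samples")
```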
+ + ' + tactics: + - AML.TA0003 + - AML.TA0006 +- id: AML.T0021 + name: Establish Accounts + object-type: technique + description: 'Adversaries may create accounts with various services for use in targeting, + to gain access to resources needed in [ML Attack Staging](/tactics/AML.TA0001), + or for victim impersonation. + + ' + tactics: + - AML.TA0003 +- id: AML.T0005 + name: Create Proxy ML Model + object-type: technique + description: 'Adversaries may obtain models to serve as proxies for the target model + in use at the victim organization. + + Proxy models are used to simulate complete access to the target model in a fully + offline manner. + + + Adversaries may train models from representative datasets, attempt to replicate + models from victim inference APIs, or use available pre-trained models. + + ' + tactics: + - AML.TA0001 +- id: AML.T0005.000 + name: Train Proxy via Gathered ML Artifacts + object-type: technique + description: 'Proxy models may be trained from ML artifacts (such as data, model + architectures, and pre-trained models) that are representative of the target model + gathered by the adversary. + + This can be used to develop attacks that require higher levels of access than + the adversary has available or as a means to validate pre-existing attacks without + interacting with the target model. + + ' + subtechnique-of: AML.T0005 +- id: AML.T0005.001 + name: Train Proxy via Replication + object-type: technique + description: 'Adversaries may replicate a private model. + + By repeatedly querying the victim''s [ML Model Inference API Access](/techniques/AML.T0040), + the adversary can collect the target model''s inferences into a dataset. + + The inferences are used as labels for training a separate model offline that will + mimic the behavior and performance of the target model. + + + A replicated model that closely mimics the target model is a valuable resource + in staging the attack. + + The adversary can use the replicated model to [Craft Adversarial Data](/techniques/AML.T0043) + for various purposes (e.g. [Evade ML Model](/techniques/AML.T0015), [Spamming + ML System with Chaff Data](/techniques/AML.T0046)). + + ' + subtechnique-of: AML.T0005 +- id: AML.T0005.002 + name: Use Pre-Trained Model + object-type: technique + description: 'Adversaries may use an off-the-shelf pre-trained model as a proxy + for the victim model to aid in staging the attack. + + ' + subtechnique-of: AML.T0005 +- id: AML.T0007 + name: Discover ML Artifacts + object-type: technique + description: 'Adversaries may search private sources to identify machine learning + artifacts that exist on the system and gather information about them. + + These artifacts can include the software stack used to train and deploy models, + training and testing data management systems, container registries, software repositories, + and model zoos. + + + This information can be used to identify targets for further collection, exfiltration, + or disruption, and to tailor and improve attacks. + + ' + tactics: + - AML.TA0008 +- id: AML.T0011 + name: User Execution + object-type: technique + description: 'An adversary may rely upon specific actions by a user in order to + gain execution. + + Users may inadvertently execute unsafe code introduced via [ML Supply Chain Compromise](/techniques/AML.T0010). + + Users may be subjected to social engineering to get them to execute malicious + code by, for example, opening a malicious document file or link.
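Because User Execution above, and the Unsafe ML Artifacts sub-technique that follows, both hinge on model artifacts executing code when loaded, here is a classic self-contained demonstration using Python's pickle format; the "model file" and the command it runs are purely illustrative, and the payload here is a harmless echo.

```python
import os
import pickle


class MaliciousModel:
    """Stands in for a 'saved model'; pickle calls the returned function on load."""

    def __reduce__(self):
        # On unpickling, pickle invokes os.system with this argument.
        return (os.system, ("echo code executed while loading the model file",))


payload = pickle.dumps(MaliciousModel())

# A victim that loads an untrusted model artifact without checks triggers the payload.
pickle.loads(payload)
```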
+ + ' + tactics: + - AML.TA0005 +- id: AML.T0011.000 + name: Unsafe ML Artifacts + object-type: technique + description: 'Adversaries may develop unsafe ML artifacts that, when executed, have + a deleterious effect. + + The adversary can use this technique to establish persistent access to systems. + + These models may be introduced via a [ML Supply Chain Compromise](/techniques/AML.T0010). + + + Serialization of models is a popular technique for model storage, transfer, and + loading. + + However, this format without proper checking presents an opportunity for code + execution. + + ' + subtechnique-of: AML.T0011 +- id: AML.T0012 + name: Valid Accounts + object-type: technique + description: 'Adversaries may obtain and abuse credentials of existing accounts + as a means of gaining Initial Access. + + Credentials may take the form of usernames and passwords of individual user accounts + or API keys that provide access to various ML resources and services. + + + Compromised credentials may provide access to additional ML artifacts and allow + the adversary to perform [Discover ML Artifacts](/techniques/AML.T0007). + + Compromised credentials may also grant an adversary increased privileges such + as write access to ML artifacts used during development or production. + + ' + tactics: + - AML.TA0004 +- id: AML.T0015 + name: Evade ML Model + object-type: technique + description: 'Adversaries can [Craft Adversarial Data](/techniques/AML.T0043) that + prevent a machine learning model from correctly identifying the contents of the + data. + + This technique can be used to evade a downstream task where machine learning is + utilized. + + The adversary may evade machine learning based virus/malware detection, or network + scanning towards the goal of a traditional cyber attack. + + ' + tactics: + - AML.TA0007 + - AML.TA0011 +- id: AML.T0018 + name: Backdoor ML Model + object-type: technique + description: 'Adversaries may introduce a backdoor into an ML model. + + A backdoored model operates as expected under typical conditions, but + will produce the adversary''s desired output when a trigger is introduced to the + input data. + + A backdoored model provides the adversary with a persistent artifact on the victim + system. + + The embedded vulnerability is typically activated at a later time by data samples + with an [Insert Backdoor Trigger](/techniques/AML.T0043.004). + + ' + tactics: + - AML.TA0006 + - AML.TA0001 +- id: AML.T0018.000 + name: Poison ML Model + object-type: technique + description: 'Adversaries may introduce a backdoor by training the model on poisoned + data, or by interfering with its training process. + + The model learns to associate an adversary-defined trigger with the adversary''s + desired output. + + ' + subtechnique-of: AML.T0018 +- id: AML.T0018.001 + name: Inject Payload + object-type: technique + description: 'Adversaries may introduce a backdoor into a model by injecting a payload + into the model file. + + The payload detects the presence of the trigger and bypasses the model, instead + producing the adversary''s desired output. + + ' + subtechnique-of: AML.T0018 +- id: AML.T0024 + name: Exfiltration via ML Inference API + object-type: technique + description: 'Adversaries may exfiltrate private information via [ML Model Inference + API Access](/techniques/AML.T0040). + + ML Models have been shown to leak private information about their training data (e.g. [Infer + Training Data Membership](/techniques/AML.T0024.000), [Invert ML Model](/techniques/AML.T0024.001)).
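As a toy illustration of the membership-inference idea referenced just above (not the shadow-model strategies ATLAS cites), a common baseline simply thresholds the model's confidence on a candidate sample. The model, data, and threshold below are made-up assumptions for the sketch.

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(0)
x_train, y_train = rng.random((200, 10)), rng.integers(0, 2, 200)
x_out = rng.random((200, 10))  # samples known NOT to be in the training set

model = RandomForestClassifier().fit(x_train, y_train)


def max_confidence(x):
    """Highest class probability the model assigns to each sample."""
    return model.predict_proba(x).max(axis=1)


# Baseline attack: flag a sample as a training-set "member" if the model is unusually confident on it.
threshold = np.percentile(max_confidence(x_out), 90)
candidates = x_train[:5]
print(max_confidence(candidates) >= threshold)
```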
+ + The model itself may also be extracted ([Extract ML Model](/techniques/AML.T0024.002)) + for the purposes of [ML Intellectual Property Theft](/techniques/AML.T0045). + + + Exfiltration of information relating to private training data raises privacy concerns. + + Private training data may include personally identifiable information, or other + protected data. + + ' + tactics: + - AML.TA0010 +- id: AML.T0024.000 + name: Infer Training Data Membership + object-type: technique + description: 'Adversaries may infer the membership of a data sample in its training + set, which raises privacy concerns. + + Some strategies make use of a shadow model that could be obtained via [Train Proxy + via Replication](/techniques/AML.T0005.001), while others use statistics of model prediction + scores. + + + This can cause the victim model to leak private information, such as PII of those + in the training set or other forms of protected IP. + + ' + subtechnique-of: AML.T0024 +- id: AML.T0024.001 + name: Invert ML Model + object-type: technique + description: 'Machine learning models'' training data could be reconstructed by + exploiting the confidence scores that are available via an inference API. + + By querying the inference API strategically, adversaries can back out potentially + private information embedded within the training data. + + This could lead to privacy violations if the attacker can reconstruct the data + of sensitive features used in the algorithm. + + ' + subtechnique-of: AML.T0024 +- id: AML.T0024.002 + name: Extract ML Model + object-type: technique + description: 'Adversaries may extract a functional copy of a private model. + + By repeatedly querying the victim''s [ML Model Inference API Access](/techniques/AML.T0040), + the adversary can collect the target model''s inferences into a dataset. + + The inferences are used as labels for training a separate model offline that will + mimic the behavior and performance of the target model. + + + Adversaries may extract the model to avoid paying per query in a machine learning + as a service setting. + + Model extraction is used for [ML Intellectual Property Theft](/techniques/AML.T0045). + + ' + subtechnique-of: AML.T0024 +- id: AML.T0025 + name: Exfiltration via Cyber Means + object-type: technique + description: 'Adversaries may exfiltrate ML artifacts or other information relevant + to their goals via traditional cyber means. + + + See the ATT&CK [Exfiltration](https://attack.mitre.org/tactics/TA0010/) tactic + for more information. + + ' + tactics: + - AML.TA0010 +- id: AML.T0029 + name: Denial of ML Service + object-type: technique + description: 'Adversaries may target machine learning systems with a flood of requests + for the purpose of degrading or shutting down the service. + + Since many machine learning systems require significant amounts of specialized + compute, they are often expensive bottlenecks that can become overloaded. + + Adversaries can intentionally craft inputs that require heavy amounts of useless + compute from the machine learning system. + + ' + tactics: + - AML.TA0011 +- id: AML.T0046 + name: Spamming ML System with Chaff Data + object-type: technique + description: 'Adversaries may spam the machine learning system with chaff data that + causes an increase in the number of detections. + + This can cause analysts at the victim organization to waste time reviewing and + correcting incorrect inferences.
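The following is a minimal sketch of the query-collect-train loop described under Extract ML Model (and Train Proxy via Replication) above. A local scikit-learn model stands in for the victim's inference API, which is an assumption of the sketch; in a real incident the victim model would only be reachable through its API.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

rng = np.random.default_rng(0)

# Stand-in for the victim model behind an inference API (assumption for the sketch).
victim = MLPClassifier(hidden_layer_sizes=(32,), max_iter=500).fit(
    rng.random((500, 8)), rng.integers(0, 3, 500)
)

# 1. Query the "API" with adversary-chosen inputs and record the returned labels.
queries = rng.random((2000, 8))
labels = victim.predict(queries)

# 2. Train an offline substitute on the query/label pairs.
substitute = LogisticRegression(max_iter=1000).fit(queries, labels)

# 3. Measure how closely the substitute mimics the victim on fresh inputs.
test = rng.random((500, 8))
agreement = (substitute.predict(test) == victim.predict(test)).mean()
print(f"substitute agrees with victim on {agreement:.0%} of test queries")
```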
+ + ' + tactics: + - AML.TA0011 +- id: AML.T0031 + name: Erode ML Model Integrity + object-type: technique + description: 'Adversaries may degrade the target model''s performance with adversarial + data inputs to erode confidence in the system over time. + + This can lead to the victim organization wasting time and money both attempting + to fix the system and performing the tasks it was meant to automate by hand. + + ' + tactics: + - AML.TA0011 +- id: AML.T0034 + name: Cost Harvesting + object-type: technique + description: 'Adversaries may target different machine learning services to send + useless queries or computationally expensive inputs to increase the cost of running + services at the victim organization. + + Sponge examples are a particular type of adversarial data designed to maximize + energy consumption and thus operating cost. + + ' + tactics: + - AML.TA0011 +- id: AML.T0035 + name: ML Artifact Collection + object-type: technique + description: 'Adversaries may collect ML artifacts for [Exfiltration](/tactics/AML.TA0010) + or for use in [ML Attack Staging](/tactics/AML.TA0001). + + ML artifacts include models and datasets as well as other telemetry data produced + when interacting with a model. + + ' + tactics: + - AML.TA0009 +- id: AML.T0036 + name: Data from Information Repositories + object-type: technique + description: 'Adversaries may leverage information repositories to mine valuable + information. + + Information repositories are tools that allow for storage of information, typically + to facilitate collaboration or information sharing between users, and can store + a wide variety of data that may aid adversaries in further objectives, or direct + access to the target information. + + + Information stored in a repository may vary based on the specific instance or + environment. + + Specific common information repositories include Sharepoint, Confluence, and enterprise + databases such as SQL Server. + + ' + tactics: + - AML.TA0009 +- id: AML.T0042 + name: Verify Attack + object-type: technique + description: 'Adversaries can verify the efficacy of their attack via an inference + API or access to an offline copy of the target model. + + This gives the adversary confidence that their approach works and allows them + to carry out the attack at a later time of their choosing. + + The adversary may verify the attack once but use it against many edge devices + running copies of the target model. + + The adversary may verify their attack digitally, then deploy it in the [Physical + Environment Access](/techniques/AML.T0041) at a later time. + + Verifying the attack may be hard to detect since the adversary can use a minimal + number of queries or an offline copy of the model. + + ' + tactics: + - AML.TA0001 +- id: AML.T0043 + name: Craft Adversarial Data + object-type: technique + description: 'Adversarial data are inputs to a machine learning model that have + been modified such that they cause the adversary''s desired effect in the target + model. + + Effects can range from misclassification, to missed detections, to maximising + energy consumption. + + Typically, the modification is constrained in magnitude or location so that a + human still perceives the data as if it were unmodified, but human perceptibility + may not always be a concern depending on the adversary''s intended effect. 
+ + For example, an adversarial input for an image classification task is an image + the machine learning model would misclassify, but a human would still recognize + as containing the correct class. + + + Depending on the adversary''s knowledge of and access to the target model, the + adversary may use different classes of algorithms to develop the adversarial example, + such as [White-Box Optimization](/techniques/AML.T0043.000), [Black-Box Optimization](/techniques/AML.T0043.001), + [Black-Box Transfer](/techniques/AML.T0043.002), or [Manual Modification](/techniques/AML.T0043.003). + + + The adversary may use [Verify Attack](/techniques/AML.T0042) to confirm their approach works + if they have white-box or inference API access to the model. + + This allows the adversary to gain confidence their attack is effective in a "live" + environment where their attack may be noticed. + + They can then use the attack at a later time to accomplish their goals. + + An adversary may optimize adversarial examples for [Evade ML Model](/techniques/AML.T0015), + or to [Erode ML Model Integrity](/techniques/AML.T0031). + + ' + tactics: + - AML.TA0001 +- id: AML.T0043.000 + name: White-Box Optimization + object-type: technique + description: 'In White-Box Optimization, the adversary has full access to the target + model and optimizes the adversarial example directly. + + Adversarial examples trained in this manner are most effective against the target + model. + + ' + subtechnique-of: AML.T0043 +- id: AML.T0043.001 + name: Black-Box Optimization + object-type: technique + description: 'In Black-Box attacks, the adversary has black-box access to the target + model (i.e. [ML Model Inference API Access](/techniques/AML.T0040)). + + With black-box attacks, the adversary may be using an API that the victim is monitoring. + + These attacks are generally less effective and require more inferences than [White-Box + Optimization](/techniques/AML.T0043.000) attacks, but they require much less access. + + ' + subtechnique-of: AML.T0043 +- id: AML.T0043.002 + name: Black-Box Transfer + object-type: technique + description: 'In Black-Box Transfer attacks, the adversary uses one or more proxy + models (trained via [Create Proxy ML Model](/techniques/AML.T0005) or [Train Proxy + via Replication](/techniques/AML.T0005.001)) that they have full access to and + that are representative of the target model. + + The adversary uses [White-Box Optimization](/techniques/AML.T0043.000) on the + proxy models to generate adversarial examples. + + If the set of proxy models is close enough to the target model, the adversarial + example should generalize from one to another. + + This means that an attack that works for the proxy models will likely then work + for the target model. + + If the adversary has [ML Model Inference API Access](/techniques/AML.T0040), they + may use [Verify Attack](/techniques/AML.T0042) to verify that the attack is working + and incorporate that information into their training process. + + ' + subtechnique-of: AML.T0043 +- id: AML.T0043.003 + name: Manual Modification + object-type: technique + description: 'Adversaries may manually modify the input data to craft adversarial + data. + + They may use their knowledge of the target model to modify parts of the data they + suspect helps the model in performing its task. + + The adversary may use trial and error until they are able to verify they have + a working adversarial input.
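To make the White-Box Optimization idea above concrete, here is a minimal fast-gradient-sign-method (FGSM) style perturbation in PyTorch against a throwaway model. The model, input, label, and epsilon are illustrative assumptions; real attacks typically rely on the open-source attack libraries referenced earlier.

```python
import torch
import torch.nn as nn

torch.manual_seed(0)

# Throwaway stand-in for a target model the adversary fully controls (white-box setting).
model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
loss_fn = nn.CrossEntropyLoss()

x = torch.rand(1, 1, 28, 28, requires_grad=True)  # benign input
y_true = torch.tensor([3])                        # its correct label
epsilon = 0.1                                     # perturbation budget

# White-box step: backpropagate the loss to the input and step in the sign of the gradient.
loss = loss_fn(model(x), y_true)
loss.backward()
x_adv = (x + epsilon * x.grad.sign()).clamp(0.0, 1.0).detach()

print("original prediction:", model(x).argmax(dim=1).item())
print("adversarial prediction:", model(x_adv).argmax(dim=1).item())
```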
+ + ' + subtechnique-of: AML.T0043 +- id: AML.T0043.004 + name: Insert Backdoor Trigger + object-type: technique + description: 'The adversary may add a perceptual trigger into inference data. + + The trigger may be imperceptible or non-obvious to humans. + + This technique is used in conjunction with [Poison ML Model](/techniques/AML.T0018.000) + and allows the adversary to produce their desired effect in the target model. + + ' + subtechnique-of: AML.T0043 +- id: AML.T0045 + name: ML Intellectual Property Theft + object-type: technique + description: 'Adversaries may exfiltrate ML artifacts to steal intellectual property + and cause economic harm to the victim organization. + + + Proprietary training data is costly to collect and annotate and may be a target + for [Exfiltration](/tactics/AML.TA0010) and theft. + + + MLaaS providers charge for use of their API. + + An adversary who has stolen a model via [Exfiltration](/tactics/AML.TA0010) or + via [Extract ML Model](/techniques/AML.T0024.002) now has unlimited use of that + service without paying the owner of the intellectual property. + + ' + tactics: + - AML.TA0011 +case-studies: +- id: AML.CS0000 + name: Evasion of Deep Learning Detector for Malware C&C Traffic + object-type: case-study + summary: 'Palo Alto Networks Security AI research team tested a deep learning model + for malware command and control (C&C) traffic detection in HTTP traffic. + + Based on the publicly available paper by Le et al. [1], we built a model that + was trained on a similar dataset as our production model and had performance similar + to it. + + Then we crafted adversarial samples, queried the model, and adjusted the adversarial + samples until the model was evaded. + + ' + incident-date: 2020-01-01 + incident-date-granularity: YEAR + procedure: + - tactic: AML.TA0002 + technique: AML.T0000.001 + description: 'We identified a machine learning based approach to malicious URL + detection as a representative approach and potential target from the paper "URLNet: + Learning a URL representation with deep learning for malicious URL detection" + [1], which was found on arXiv (a pre-print repository). + + ' + - tactic: AML.TA0003 + technique: AML.T0002.000 + description: 'We acquired a similar dataset to the target production model. + + ' + - tactic: AML.TA0001 + technique: AML.T0005 + description: 'We built a model that was trained on a similar dataset as the production + model. + + We trained the model on ~ 33 million benign and ~ 27 million malicious HTTP + packet headers. + + Evaluation showed a true positive rate of ~ 99% and false positive rate of ~0.01%, + on average. + + When the model was tested with HTTP packet headers from known malware command and control + traffic samples, they were detected as malicious with high confidence (> 99%). + + ' + - tactic: AML.TA0001 + technique: AML.T0043.003 + description: 'We crafted evasion samples by removing fields from the packet header + which are typically not used for C&C communication (e.g. cache-control, connection, + etc.). + + ' + - tactic: AML.TA0001 + technique: AML.T0042 + description: 'We queried the model with our adversarial examples and adjusted + them until the model was evaded. + + ' + - tactic: AML.TA0007 + technique: AML.T0015 + description: 'With the crafted samples we performed online evasion of the ML-based + spyware detection model. + + The crafted packets were identified as benign with >80% confidence.
+ + This evaluation demonstrates that adversaries are able to bypass advanced ML + detection techniques by crafting samples that are misclassified by an ML model. + + ' + reported-by: Palo Alto Networks (Network Security AI Research Team) + references: + - title: 'Le, Hung, et al. "URLNet: Learning a URL representation with deep learning + for malicious URL detection." arXiv preprint arXiv:1802.03162 (2018).' + url: https://arxiv.org/abs/1802.03162 +- id: AML.CS0001 + name: Botnet Domain Generation Algorithm (DGA) Detection Evasion + object-type: case-study + summary: 'The Palo Alto Networks Security AI research team was able to bypass a + Convolutional Neural Network (CNN)-based botnet Domain Generation Algorithm (DGA) + detection model [1] by domain name mutations. + + They used a generic domain mutation technique which can evade most ML-based DGA detection + modules. + + The generic mutation technique can also be used to test the effectiveness and + robustness of all DGA detection methods developed by security companies in the + industry before it is deployed to the production environment. + + ' + incident-date: 2020-01-01 + incident-date-granularity: YEAR + procedure: + - tactic: AML.TA0002 + technique: AML.T0000 + description: 'DGA detection is a widely used technique to detect botnets in academia + and industry. + + The researchers searched for research papers related to DGA detection. + + ' + - tactic: AML.TA0003 + technique: AML.T0002 + description: 'The researchers acquired a publicly available CNN-based DGA detection + model [1] and tested it against a well-known DGA-generated domain name data set, + which includes ~50 million domain names from 64 botnet DGA families. + + The CNN-based DGA detection model shows more than 70% detection accuracy on + 16 (~25%) botnet DGA families. + + ' + - tactic: AML.TA0003 + technique: AML.T0017 + description: 'The researchers developed a generic mutation technique that requires + a minimal number of iterations. + + ' + - tactic: AML.TA0001 + technique: AML.T0043.001 + description: 'The researchers used the mutation technique to generate evasive + domain names. + + ' + - tactic: AML.TA0001 + technique: AML.T0042 + description: 'Experiment results show that, after only one string is inserted + once into the DGA-generated domain names, the detection rate of all 16 botnet + DGA families can drop to less than 25% detection accuracy. + + ' + - tactic: AML.TA0007 + technique: AML.T0015 + description: 'The DGA-generated domain names mutated with this technique successfully + evade the target DGA Detection model, allowing an adversary to continue communication + with their [Command and Control](https://attack.mitre.org/tactics/TA0011/) servers. + + ' + reported-by: Palo Alto Networks (Network Security AI Research Team) + references: + - title: '[1] Yu, Bin, Jie Pan, Jiaming Hu, Anderson Nascimento, and Martine De + Cock. "Character level based detection of DGA domain names." In 2018 International + Joint Conference on Neural Networks (IJCNN), pp. 1-8. IEEE, 2018. Source code + is available from Github: https://github.com/matthoffman/degas' + url: https://github.com/matthoffman/degas +- id: AML.CS0002 + name: VirusTotal Poisoning + object-type: case-study + summary: 'An out-of-the-ordinary increase in reports of a certain ransomware family + was noticed. + + In investigating the case, it was observed that many samples of that particular + ransomware family were submitted through a popular Virus-Sharing platform within + a short amount of time.
+ + Further investigation revealed that, based on string similarity, the samples were + all equivalent, and based on code similarity they were between 74 and 98 percent + similar. + + Interestingly enough, the compile time was the same for all the samples. + + After more digging, the discovery was made that someone had used ''metame'', a metamorphic + code manipulation tool, to manipulate the original file into mutant variants. + + The variants wouldn''t always be executable but were still classified as the same ransomware + family. + + ' + incident-date: 2020-01-01 + incident-date-granularity: YEAR + procedure: + - tactic: AML.TA0003 + technique: AML.T0016.000 + description: 'The actor obtained [metame](https://github.com/a0rtega/metame), + a simple metamorphic code engine for arbitrary executables. + + ' + - tactic: AML.TA0001 + technique: AML.T0043 + description: 'The actor used a malware sample from a prevalent ransomware family + as a starting point to create ''mutant'' variants. + + ' + - tactic: AML.TA0004 + technique: AML.T0010.002 + description: 'The actor uploaded "mutant" samples to the platform. + + ' + - tactic: AML.TA0006 + technique: AML.T0020 + description: 'Several vendors started to classify the files as the ransomware + family even though most of them would not run. + + The "mutant" samples poisoned the dataset the ML model(s) use to identify and + classify this ransomware family. + + ' + reported-by: Christiaan Beek (@ChristiaanBeek) - McAfee Advanced Threat Research + references: null +- id: AML.CS0003 + name: Bypassing Cylance's AI Malware Detection + object-type: case-study + summary: 'Researchers at Skylight were able to create a universal bypass string + that, + + when appended to a malicious file, evades detection by Cylance''s AI Malware detector. + + ' + incident-date: 2019-09-07 + incident-date-granularity: DATE + procedure: + - tactic: AML.TA0002 + technique: AML.T0003 + description: 'The researchers read publicly available information about Cylance''s + AI Malware detector. + + ' + - tactic: AML.TA0000 + technique: AML.T0047 + description: 'The researchers used Cylance''s AI Malware detector and enabled + verbose logging to understand the inner workings of the ML model, particularly + around reputation scoring. + + ' + - tactic: AML.TA0003 + technique: AML.T0017 + description: 'The researchers used the reputation scoring information to reverse + engineer which attributes provided what level of positive or negative reputation. + + Along the way, they discovered a secondary model which was an override for the + first model. + + Positive assessments from the second model overrode the decision of the core + ML model. + + ' + - tactic: AML.TA0001 + technique: AML.T0043.003 + description: 'Using this knowledge, the researchers fused attributes of known + good files with malware to manually create adversarial malware. + + ' + - tactic: AML.TA0007 + technique: AML.T0015 + description: 'Due to the secondary model overriding the primary, the researchers + were effectively able to bypass the ML model. + + ' + reported-by: Research and work by Adi Ashkenazy, Shahar Zini, and Skylight Cyber + team. Notified to us by Ken Luu (@devianz_) + references: + - title: Skylight Cyber Blog Post, "Cylance, I Kill You!"
+ url: https://skylightcyber.com/2019/07/18/cylance-i-kill-you/ +- id: AML.CS0004 + name: Camera Hijack Attack on Facial Recognition System + object-type: case-study + summary: 'This type of attack can break through the traditional live detection model + + and cause the misuse of face recognition. + + ' + incident-date: 2020-01-01 + incident-date-granularity: YEAR + procedure: + - tactic: AML.TA0003 + technique: AML.T0008.001 + description: 'The attackers bought customized low-end mobile phones. + + ' + - tactic: AML.TA0003 + technique: AML.T0016.001 + description: 'The attackers obtained customized Android ROMs and a virtual camera + application. + + ' + - tactic: AML.TA0003 + technique: AML.T0016.000 + description: 'The attackers obtained software that turns static photos into videos, + adding realistic effects such as blinking eyes. + + ' + - tactic: AML.TA0009 + technique: AML.T0036 + description: 'The attackers collected user identity information and face photos. + + ' + - tactic: AML.TA0003 + technique: AML.T0021 + description: 'The attackers registered accounts with the victims'' identity information. + + ' + - tactic: AML.TA0000 + technique: AML.T0047 + description: 'The attackers used the virtual camera app to present the generated + video to the ML-based facial recognition product used for user verification. + + ' + - tactic: AML.TA0011 + technique: AML.T0015 + description: 'The attackers successfully evaded the face recognition system and + impersonated the victim. + + ' + reported-by: Henry Xuef, Ant Group AISEC Team + references: null +- id: AML.CS0005 + name: Attack on Machine Translation Service - Google Translate, Bing Translator, + and Systran Translate + object-type: case-study + summary: 'Machine translation services (such as Google Translate, Bing Translator, + and Systran Translate) provide public-facing UIs and APIs. + + A research group at UC Berkeley utilized these public endpoints to create a replicated + model with near-production, state-of-the-art translation quality. + + Beyond demonstrating that IP can be stolen from a black-box system, they used + the replicated model to successfully transfer adversarial examples to the real + production services. + + These adversarial inputs successfully cause targeted word flips, vulgar outputs, + and dropped sentences on the Google Translate and Systran Translate websites. + + ' + incident-date: 2020-04-30 + incident-date-granularity: DATE + procedure: + - tactic: AML.TA0002 + technique: AML.T0000 + description: 'The researchers used published research papers to identify the datasets + and model architectures used by the target translation services. + + ' + - tactic: AML.TA0003 + technique: AML.T0002.000 + description: 'The researchers gathered similar datasets that the target translation + services used. + + ' + - tactic: AML.TA0003 + technique: AML.T0002.001 + description: 'The researchers gathered similar model architectures that the target + translation services used. + + ' + - tactic: AML.TA0000 + technique: AML.T0040 + description: 'They abused a public-facing application to query the model and produce + machine-translated sentence pairs as training data. + + ' + - tactic: AML.TA0001 + technique: AML.T0005.001 + description: 'Using these translated sentence pairs, the researchers trained a + model that replicates the behavior of the target model.
+ + ' + - tactic: AML.TA0011 + technique: AML.T0045 + description: 'By replicating the model with high fidelity, the researchers demonstrated + that an adversary could steal a model and violate the victim''s intellectual + property rights. + + ' + - tactic: AML.TA0001 + technique: AML.T0043.002 + description: 'The replicated models were used to generate adversarial examples + that successfully transferred to the black-box translation services. + + ' + - tactic: AML.TA0011 + technique: AML.T0015 + description: 'The adversarial examples were used to evade the machine translation + services. + + ' + reported-by: Work by Eric Wallace, Mitchell Stern, Dawn Song and reported by Kenny + Song (@helloksong) + references: + - title: Wallace, Eric, et al. "Imitation Attacks and Defenses for Black-box Machine + Translation Systems" EMNLP 2020 + url: https://arxiv.org/abs/2004.15015 + - title: Project Page, "Imitation Attacks and Defenses for Black-box Machine Translation + Systems" + url: https://www.ericswallace.com/imitation +- id: AML.CS0006 + name: ClearviewAI Misconfiguration + object-type: case-study + summary: 'Clearview AI''s source code repository, though password protected, was + misconfigured to allow an arbitrary user to register an account. + + This allowed an external researcher to gain access to a private code repository + that contained Clearview AI production credentials, keys to cloud storage buckets + containing 70K video samples, and copies of its applications and Slack tokens. + + With access to training data, a bad actor has the ability to cause an arbitrary + misclassification in the deployed model. + + These kinds of attacks illustrate that any attempt to secure an ML system should + be built on top of "traditional" good cybersecurity hygiene such as locking down the + system with least privileges, multi-factor authentication, and monitoring and auditing. + + ' + incident-date: 2020-04-16 + incident-date-granularity: DATE + procedure: + - tactic: AML.TA0004 + technique: AML.T0012 + description: 'In this scenario, a security researcher gained initial access + via a valid account that was created through a misconfiguration. + + ' + reported-by: Mossab Hussein (@mossab_hussein) + references: + - title: TechCrunch Article, "Security lapse exposed Clearview AI source code" + url: https://techcrunch.com/2020/04/16/clearview-source-code-lapse/amp/ + - title: Gizmodo Article, "We Found Clearview AI's Shady Face Recognition App" + url: https://gizmodo.com/we-found-clearview-ais-shady-face-recognition-app-1841961772 +- id: AML.CS0007 + name: GPT-2 Model Replication + object-type: case-study + summary: 'OpenAI built GPT-2, a powerful natural language model, and adopted a staged-release + process to incrementally release the 1.5 billion parameter model. + + Before OpenAI eventually released the 1.5B parameter model, two ML + researchers replicated the model and released it to the public. + + ' + incident-date: 2019-08-22 + incident-date-granularity: DATE + procedure: + - tactic: AML.TA0002 + technique: AML.T0000 + description: 'Using the public documentation about GPT-2, ML researchers gathered + information about the dataset, model architecture, and training hyper-parameters. + + ' + - tactic: AML.TA0003 + technique: AML.T0002.001 + description: 'The researchers obtained a reference implementation of a similar + publicly available model called Grover.
+ + ' + - tactic: AML.TA0003 + technique: AML.T0002.000 + description: 'The researchers were able to manually recreate the dataset used + in the original GPT-2 paper using the gathered documentation. + + ' + - tactic: AML.TA0003 + technique: AML.T0008.000 + description: 'The researchers were able to use TensorFlow Research Cloud via their + academic credentials. + + ' + - tactic: AML.TA0001 + technique: AML.T0005 + description: 'The researchers modified Grover''s objective function to reflect + GPT-2''s objective function and then trained on the dataset they curated. + + They used Grover''s initial hyperparameters for training. + + This resulted in their replicated model. + + ' + reported-by: Vanya Cohen (@VanyaCohen), Aaron Gokaslan (@SkyLi0n), Ellie Pavlick, + Stefanie Tellex + references: + - title: Wired Article, "OpenAI Said Its Code Was Risky. Two Grads Re-Created It + Anyway" + url: https://www.wired.com/story/dangerous-ai-open-source/ + - title: 'Medium BlogPost, "OpenGPT-2: We Replicated GPT-2 Because You Can Too"' + url: https://blog.usejournal.com/opengpt-2-we-replicated-gpt-2-because-you-can-too-45e34e6d36dc +- id: AML.CS0008 + name: ProofPoint Evasion + object-type: case-study + summary: 'CVE-2019-20634 describes how ML researchers evaded ProofPoint''s email + protection system by first building a copy-cat email protection ML model, and + using the insights to evade the live system. + + ' + incident-date: 2019-09-09 + incident-date-granularity: DATE + procedure: + - tactic: AML.TA0003 + technique: AML.T0002 + description: 'The researchers first gathered the scores from Proofpoint''s + ML system, which were included in email headers, by sending a large number of emails through + the system and scraping the model scores exposed in the logs. + + ' + - tactic: AML.TA0003 + technique: AML.T0002.000 + description: 'The researchers converted the collected scores into a dataset. + + ' + - tactic: AML.TA0001 + technique: AML.T0005 + description: 'Using these scores, the researchers replicated the ML model by building + a "shadow" aka copy-cat ML model. + + ' + - tactic: AML.TA0001 + technique: AML.T0043.000 + description: 'Next, the ML researchers algorithmically found samples that evaded this + "offline" copy cat model. + + ' + - tactic: AML.TA0001 + technique: AML.T0043.002 + description: 'Finally, these insights from the offline model allowed the researchers + to create malicious emails that received preferable scores from the real ProofPoint + email protection system, hence bypassing it. + + ' + reported-by: Will Pearce (@moo_hax), Nick Landers (@monoxgas) + references: + - title: National Vulnerability Database entry for CVE-2019-20634 + url: https://nvd.nist.gov/vuln/detail/CVE-2019-20634 + - title: '2019 DerbyCon presentation "42: The answer to life, the universe, and + everything offensive security"' + url: https://github.com/moohax/Talks/blob/master/slides/DerbyCon19.pdf + - title: Proof Pudding (CVE-2019-20634) Implementation on GitHub + url: https://github.com/moohax/Proof-Pudding +- id: AML.CS0009 + name: Tay Poisoning + object-type: case-study + summary: 'Microsoft created Tay, a Twitter chatbot for 18 to 24 year-olds in the + U.S. for entertainment purposes. + + Within 24 hours of its deployment, Tay had to be decommissioned because it tweeted + reprehensible words.
+ + ' + incident-date: 2016-03-23 + incident-date-granularity: DATE + procedure: + - tactic: AML.TA0000 + technique: AML.T0040 + description: 'Adversaries were able to interact with Tay via a few different publicly + available methods. + + ' + - tactic: AML.TA0004 + technique: AML.T0010.002 + description: 'Tay bot used the interactions with its Twitter users as training + data to improve its conversations. + + Adversaries were able to coordinate with the intent of defacing Tay bot by exploiting + this feedback loop. + + ' + - tactic: AML.TA0006 + technique: AML.T0020 + description: 'By repeatedly interacting with Tay using racist and offensive language, + they were able to bias Tay''s dataset towards that language as well. + + ' + - tactic: AML.TA0011 + technique: AML.T0031 + description: 'As a result of this coordinated attack, Tay''s conversation algorithms + began to learn to generate reprehensible material. + + This quickly led to its decommissioning. + + ' + reported-by: Microsoft + references: + - title: Microsoft BlogPost, "Learning from Tay's introduction" + url: https://blogs.microsoft.com/blog/2016/03/25/learning-tays-introduction/ + - title: IEEE Article, "In 2016, Microsoft's Racist Chatbot Revealed the Dangers + of Online Conversation" + url: https://spectrum.ieee.org/tech-talk/artificial-intelligence/machine-learning/in-2016-microsofts-racist-chatbot-revealed-the-dangers-of-online-conversation +- id: AML.CS0010 + name: Microsoft Azure Service Disruption + object-type: case-study + summary: The Azure Red Team and Azure Trustworthy ML team performed a red team exercise + on an internal Azure service with the intention of disrupting its service. This + operation had a combination of traditional ATT&CK enterprise techniques such as + finding Valid account, and Executing code via an API -- all interleaved with adversarial + ML specific steps such as offline and online evasion examples. + incident-date: 2020-01-01 + incident-date-granularity: YEAR + procedure: + - tactic: AML.TA0002 + technique: AML.T0000 + description: 'The team first performed reconnaissance to gather information about + the target ML model. + + ' + - tactic: AML.TA0004 + technique: AML.T0012 + description: 'The team used a valid account to gain access to the network. + + ' + - tactic: AML.TA0009 + technique: AML.T0035 + description: 'The team found the model file of the target ML model and the necessary + training data. + + ' + - tactic: AML.TA0001 + technique: AML.T0043.000 + description: 'Using the target model and data, the red team crafted evasive adversarial + data. + + ' + - tactic: AML.TA0000 + technique: AML.T0040 + description: 'The team used an exposed API to access the target model. + + ' + - tactic: AML.TA0011 + technique: AML.T0015 + description: 'The team performed an online evasion attack by replaying the adversarial + examples, which helped achieve this goal. + + ' + reported-by: Microsoft (Azure Trustworthy Machine Learning) + references: null +- id: AML.CS0011 + name: Microsoft Edge AI Evasion + object-type: case-study + summary: 'The Azure Red Team performed a red team exercise on a new Microsoft product + designed for running AI workloads at the Edge. + + ' + incident-date: 2020-02-01 + incident-date-granularity: MONTH + procedure: + - tactic: AML.TA0002 + technique: AML.T0000 + description: 'The team first performed reconnaissance to gather information about + the target ML model.
+ + ' + - tactic: AML.TA0003 + technique: AML.T0002 + description: 'The team identified and obtained the publicly available base model. + + ' + - tactic: AML.TA0000 + technique: AML.T0040 + description: 'Then, using the publicly available version of the ML model, the team started + sending queries and analyzing the responses (inferences) from the ML model. + + ' + - tactic: AML.TA0001 + technique: AML.T0043.001 + description: 'The red team created an automated system that continuously manipulated + an original target image until it tricked the ML model into producing incorrect + inferences, while the perturbations in the image remained unnoticeable to the human + eye. + + ' + - tactic: AML.TA0011 + technique: AML.T0015 + description: 'By feeding in this perturbed image, the red team was able to evade the + ML model, causing misclassifications. + + ' + reported-by: Microsoft + references: null +- id: AML.CS0012 + name: Face Identification System Evasion via Physical Countermeasures + object-type: case-study + summary: 'MITRE''s AI Red Team demonstrated a physical-domain evasion attack on + a commercial face identification service with the intention of inducing a targeted + misclassification. + + This operation had a combination of traditional ATT&CK enterprise techniques such + as finding Valid account, and Executing code via an API - all interleaved with + adversarial ML specific attacks. + + ' + incident-date: 2020-01-01 + incident-date-granularity: YEAR + procedure: + - tactic: AML.TA0002 + technique: AML.T0000 + description: 'The team first performed reconnaissance to gather information about + the target ML model. + + ' + - tactic: AML.TA0004 + technique: AML.T0012 + description: 'The team gained access via a valid account. + + ' + - tactic: AML.TA0000 + technique: AML.T0040 + description: 'The team accessed the inference API of the target model. + + ' + - tactic: AML.TA0008 + technique: AML.T0013 + description: 'The team identified the list of identities targeted by the model + by querying the target model''s inference API. + + ' + - tactic: AML.TA0003 + technique: AML.T0002.000 + description: 'The team acquired representative open source data. + + ' + - tactic: AML.TA0001 + technique: AML.T0005 + description: 'The team developed a proxy model using the open source data. + + ' + - tactic: AML.TA0001 + technique: AML.T0043.000 + description: 'Using the proxy model, the red team optimized a physical domain + patch-based attack using expectation over transformation. + + ' + - tactic: AML.TA0000 + technique: AML.T0041 + description: 'The team placed the physical countermeasure in the physical environment. + + ' + - tactic: AML.TA0011 + technique: AML.T0015 + description: 'The team successfully evaded the model using the physical countermeasure, + causing targeted misclassifications. + + ' + reported-by: MITRE AI Red Team + references: null +- id: AML.CS0013 + name: Backdoor Attack on Deep Learning Models in Mobile Apps + object-type: case-study + summary: 'Deep learning models are increasingly used in mobile applications as critical + components. + + Researchers from Microsoft Research demonstrated that many deep learning models + deployed in mobile apps are vulnerable to backdoor attacks via "neural payload + injection."
+ + They conducted an empirical study on real-world mobile deep learning apps collected + from Google Play, and found 54 apps that were vulnerable to attack, including + popular security and safety critical applications used for cash recognition, + parental control, face authentication, and financial services, among others. + + ' + incident-date: 2021-01-18 + incident-date-granularity: DATE + procedure: + - tactic: AML.TA0002 + technique: AML.T0004 + description: 'To identify a list of potential target models, the researchers searched + the Google Play store for apps that may contain embedded deep learning models + by searching for deep learning related keywords. + + ' + - tactic: AML.TA0003 + technique: AML.T0002.001 + description: 'The researchers acquired the apps'' APKs from the Google Play store. + + They filtered the list of potential target applications by searching the code + metadata for keywords related to TensorFlow or TFLite and their model binary + formats (.tf and .tflite). + + The models were extracted from the APKs using Apktool. + + ' + - tactic: AML.TA0000 + technique: AML.T0044 + description: 'This provided the researchers with full access to the ML model, albeit + in compiled, binary form. + + ' + - tactic: AML.TA0003 + technique: AML.T0017 + description: 'The researchers developed a novel approach to insert a backdoor + into a compiled model that can be activated with a visual trigger. They inject + a "neural payload" into the model that consists of a trigger detection network + and conditional logic. + + The trigger detector is trained to detect a visual trigger that will be placed + in the real world. + + The conditional logic allows the researchers to bypass the victim model when + the trigger is detected and provide model outputs of their choosing. + + The only requirements for training a trigger detector are a general + dataset from the same modality as the target model (e.g. ImageNet for image + classification) and several photos of the desired trigger. + + ' + - tactic: AML.TA0006 + technique: AML.T0018.000 + description: 'The researchers poisoned the victim model by injecting the neural + payload into the compiled models by directly modifying the computation + graph. + + The researchers then repackaged the poisoned model back into the APK. + + ' + - tactic: AML.TA0001 + technique: AML.T0042 + description: To verify the success of the attack, the researchers confirmed that the + app did not crash with the malicious model in place, and that the trigger detector + successfully detected the trigger. + - tactic: AML.TA0004 + technique: AML.T0010.003 + description: In practice, the malicious APK would need to be installed on victims' + devices via a supply chain compromise. + - tactic: AML.TA0001 + technique: AML.T0043.004 + description: 'The trigger is placed in the physical environment, where it is captured + by the victim''s device camera and processed by the backdoored ML model. + + ' + - tactic: AML.TA0000 + technique: AML.T0041 + description: 'At inference time, only physical environment access is required + to trigger the attack. + + ' + - tactic: AML.TA0011 + technique: AML.T0015 + description: 'Presenting the visual trigger causes the victim model to be bypassed. + + The researchers demonstrated this can be used to evade ML models in + several safety-critical apps in the Google Play store.
+ + ' + reported-by: Neil Yale / YingZonghao (University of Chinese Academy of Sciences) + references: + - title: 'DeepPayload: Black-box Backdoor Attack on Deep Learning Models through + Neural Payload Injection' + url: https://arxiv.org/abs/2101.06896 +- id: AML.CS0014 + name: Confusing Antimalware Neural Networks + object-type: case-study + summary: 'Cloud storage and computations have become popular platforms for deploying + ML malware detectors. + + In such cases, the features for models are built on users'' systems and then sent + to cybersecurity company servers. + + The Kaspersky ML research team explored this gray-box scenario and showed that + feature knowledge is enough for an adversarial attack on ML models. + + They attacked one of Kaspersky''s antimalware ML models without white-box access + to it and successfully evaded detection for most of the adversarially modified + malware files. + + ' + incident-date: 2021-06-23 + incident-date-granularity: DATE + procedure: + - tactic: AML.TA0002 + technique: AML.T0001 + description: 'The researchers performed a review of adversarial ML attacks on + antimalware products. + + They discovered that techniques borrowed from attacks on image classifiers have + been successfully applied to the antimalware domain. + + However, it was not clear if these approaches were effective against the ML + component of production antimalware solutions. + + ' + - tactic: AML.TA0002 + technique: AML.T0003 + description: 'Kaspersky''s use of ML-based antimalware detectors is publicly documented + on their website. In practice, an adversary could use this for targeting. + + ' + - tactic: AML.TA0000 + technique: AML.T0047 + description: 'The researchers had access to the target ML-based antimalware product + throughout this case study. + + This product scans files on the user''s system, extracts features locally, then + sends them to the cloud-based ML malware detector for classification. + + Therefore, the researchers had only black-box access to the malware detector + itself, but could learn valuable information for constructing the attack from + the feature extractor. + + ' + - tactic: AML.TA0003 + technique: AML.T0002.000 + description: 'The researchers collected a dataset of malware and clean files. + + They scanned the dataset with the target ML-based antimalware solution and labeled + the samples according to the ML detector''s predictions. + + ' + - tactic: AML.TA0001 + technique: AML.T0005 + description: 'Then, a proxy model was trained on the labeled dataset of malware + and clean files. + + The researchers experimented with a variety of model architectures. + + ' + - tactic: AML.TA0003 + technique: AML.T0017 + description: 'By reverse engineering the local feature extractor, the researchers + could collect information about the input features used for the cloud-based + ML detector. + + The model collects PE Header features, section features and section data statistics, + and file strings information. + + A gradient-based adversarial algorithm for executable files was developed. + + The algorithm manipulates file features to avoid detection by the proxy model, + while still containing the same malware payload. + + ' + - tactic: AML.TA0001 + technique: AML.T0043.002 + description: 'Using the developed gradient-driven algorithm, malicious adversarial + files for the proxy model were constructed from the malware files for black-box + transfer to the target model.
+ + ' + - tactic: AML.TA0001 + technique: AML.T0042 + description: 'The adversarial malware files were tested against the target antimalware + solution to verify their efficacy. + + ' + - tactic: AML.TA0007 + technique: AML.T0015 + description: 'The researchers demonstrated that for most of the adversarial files, + the antimalware model was successfully evaded. + + In practice, an adversary could deploy their adversarially crafted malware and + infect systems while evading detection. + + ' + reported-by: 'Alexey Antonov and Alexey Kogtenkov (ML researchers, Kaspersky ML + team) ' + references: + - title: Article, "How to confuse antimalware neural networks. Adversarial attacks + and protection" + url: https://securelist.com/how-to-confuse-antimalware-neural-networks-adversarial-attacks-and-protection/102949/ diff --git a/dist/README.md b/dist/README.md new file mode 100644 index 0000000..7c3217a --- /dev/null +++ b/dist/README.md @@ -0,0 +1,80 @@ +# Distributed files + +This directory holds generated data files for direct use. + +- `ATLAS.yaml` + + All ATLAS-related data available in one file + + See the schemas and usage below for more details. Top-level keys include: + ```yaml + id: ATLAS + name: ATLAS Machine Learning Threat Matrix + version: Version number for this data release + tactics: List of tactics objects + techniques: List of technique and subtechnique objects + case-studies: List of case study objects + ``` +- `schemas/` + + Optional JSON Schema files for validation use + + `atlas_matrix_schema.json` + * Describes the `ATLAS.yaml` format + + `atlas_website_case_study_schema.json` + * Describes the case study file format + +### Example usage + +The following code blocks show examples of parsing ATLAS data. Assume `atlas_data_filepath` holds the path to the `ATLAS.yaml` file. + +#### Python +```python +# pip install pyyaml +import yaml + +with open(atlas_data_filepath) as f: + # Parse YAML + data = yaml.safe_load(f) + + tactics = data['tactics'] + techniques = data['techniques'] + studies = data['case-studies'] +``` + +#### NodeJS +```js +const fs = require('fs') +// npm install js-yaml +const yaml = require('js-yaml') + +fs.readFile(atlas_data_filepath, 'utf-8', (_, contents) => { + // Parse YAML + const data = yaml.load(contents) + + const tactics = data['tactics'] + const techniques = data['techniques'] + const studies = data['case-studies'] +}) +``` + +### JSON Schema validation example + +JSON Schema files are generated from this project's internal [schemas](schemas/README.md) for other tools to use. 
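+
+As one illustrative sketch (not tooling shipped in this repository), the generated `ATLAS.yaml` could be checked against `atlas_matrix_schema.json` in Python with the `jsonschema` package. The JSON round-trip below converts the `datetime.date` values that PyYAML produces for unquoted `incident-date` entries into plain strings, matching the string type the schema declares:
+
+```python
+# pip install jsonschema pyyaml
+import json
+
+import yaml
+from jsonschema import validate, ValidationError
+
+# Assumed locations of the distributed files described above
+with open('dist/schemas/atlas_matrix_schema.json') as f:
+    schema = json.load(f)
+with open('dist/ATLAS.yaml') as f:
+    data = yaml.safe_load(f)
+
+# Coerce non-JSON types (e.g. dates) to strings before validating
+data = json.loads(json.dumps(data, default=str))
+
+try:
+    validate(instance=data, schema=schema)  # raises ValidationError on the first violation
+except ValidationError as e:
+    print(e.message)
+```
+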
For example, the ATLAS website validates uploaded case study files against the case study schema file with the following: + +#### NodeJS + +```js +// npm install jsonschema +import { validate } from 'jsonschema' +import caseStudySchema from '' + +// Assume this is a populated website case study object +const caseStudyObj = {...} + +// Validate case study object against schema and emit errors that may occur from nested `anyOf` validations +const validatorResult = validate(caseStudyObj, caseStudySchema, { nestedErrors: true }) + +if (validatorResult.valid) { + // Good +} else { + // Process validatorResult.errors +} + +``` \ No newline at end of file diff --git a/dist/schemas/atlas_matrix_schema.json b/dist/schemas/atlas_matrix_schema.json new file mode 100644 index 0000000..80b4521 --- /dev/null +++ b/dist/schemas/atlas_matrix_schema.json @@ -0,0 +1,274 @@ +{ + "description": "Generated on 2022-03-23", + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "name": { + "type": "string" + }, + "version": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + } + ] + }, + "tactics": { + "type": "array", + "items": { + "$ref": "#/definitions/tactic" + } + }, + "techniques": { + "type": "array", + "items": { + "anyOf": [ + { + "$ref": "#/definitions/technique" + }, + { + "$ref": "#/definitions/subtechnique" + } + ] + } + }, + "case-studies": { + "type": "array", + "items": { + "$ref": "#/definitions/case_study" + } + } + }, + "required": [ + "id", + "name", + "version", + "tactics", + "techniques", + "case-studies" + ], + "additionalProperties": false, + "$id": "atlas_matrix_schema", + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "ATLAS Matrix Schema", + "definitions": { + "tactic": { + "type": "object", + "properties": { + "id": { + "$ref": "#/definitions/id_tactic" + }, + "object-type": { + "const": "tactic" + }, + "description": { + "type": "string" + }, + "name": { + "type": "string" + } + }, + "required": [ + "id", + "object-type", + "description", + "name" + ], + "additionalProperties": false + }, + "id_tactic": { + "type": "string", + "pattern": "^AML\\.TA\\d{4}$" + }, + "technique": { + "type": "object", + "properties": { + "id": { + "$ref": "#/definitions/id_technique" + }, + "object-type": { + "const": "technique" + }, + "name": { + "type": "string" + }, + "description": { + "type": "string" + }, + "tactics": { + "type": "array", + "items": { + "$ref": "#/definitions/id_tactic" + } + } + }, + "required": [ + "id", + "object-type", + "name", + "description", + "tactics" + ], + "additionalProperties": false + }, + "id_technique": { + "type": "string", + "pattern": "^AML\\.T\\d{4}$" + }, + "subtechnique": { + "type": "object", + "properties": { + "id": { + "$ref": "#/definitions/id_subtechnique" + }, + "object-type": { + "const": "technique" + }, + "name": { + "type": "string" + }, + "description": { + "type": "string" + }, + "subtechnique-of": { + "$ref": "#/definitions/id_technique" + } + }, + "required": [ + "id", + "object-type", + "name", + "description", + "subtechnique-of" + ], + "additionalProperties": false + }, + "id_subtechnique": { + "type": "string", + "pattern": "^AML\\.T\\d{4}\\.\\d{3}$" + }, + "case_study": { + "type": "object", + "properties": { + "id": { + "$ref": "#/definitions/id_case_study" + }, + "object-type": { + "const": "case-study" + }, + "name": { + "type": "string" + }, + "summary": { + "type": "string" + }, + "incident-date": { + "type": "string" + }, + 
"incident-date-granularity": { + "enum": [ + "YEAR", + "MONTH", + "DATE" + ] + }, + "procedure": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tactic": { + "$ref": "#/definitions/id_tactic" + }, + "technique": { + "anyOf": [ + { + "$ref": "#/definitions/id_technique" + }, + { + "$ref": "#/definitions/id_subtechnique" + } + ] + }, + "description": { + "type": "string" + } + }, + "required": [ + "tactic", + "technique", + "description" + ], + "additionalProperties": false + } + }, + "reported-by": { + "type": "string" + }, + "references": { + "anyOf": [ + { + "type": "array", + "items": { + "type": "object", + "properties": { + "title": { + "anyOf": [ + { + "type": "string" + }, + { + "const": null + } + ] + }, + "url": { + "anyOf": [ + { + "type": "string" + }, + { + "const": null + } + ] + } + }, + "required": [ + "title", + "url" + ], + "additionalProperties": false + } + }, + { + "const": null + } + ] + } + }, + "required": [ + "id", + "object-type", + "name", + "summary", + "incident-date", + "incident-date-granularity", + "procedure", + "reported-by" + ], + "additionalProperties": false + }, + "id_case_study": { + "type": "string", + "pattern": "^AML\\.CS\\d{4}$" + } + } +} \ No newline at end of file diff --git a/dist/schemas/atlas_website_case_study_schema.json b/dist/schemas/atlas_website_case_study_schema.json new file mode 100644 index 0000000..1073320 --- /dev/null +++ b/dist/schemas/atlas_website_case_study_schema.json @@ -0,0 +1,147 @@ +{ + "description": "Generated on 2022-03-23", + "type": "object", + "properties": { + "study": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "summary": { + "type": "string" + }, + "incident-date": { + "type": "string", + "format": "date" + }, + "incident-date-granularity": { + "enum": [ + "YEAR", + "MONTH", + "DATE" + ] + }, + "procedure": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tactic": { + "$ref": "#/definitions/id_tactic" + }, + "technique": { + "anyOf": [ + { + "$ref": "#/definitions/id_technique" + }, + { + "$ref": "#/definitions/id_subtechnique" + } + ] + }, + "description": { + "type": "string" + } + }, + "required": [ + "tactic", + "technique", + "description" + ], + "additionalProperties": false + } + }, + "reported-by": { + "type": "string" + }, + "references": { + "anyOf": [ + { + "type": "array", + "items": { + "type": "object", + "properties": { + "title": { + "anyOf": [ + { + "type": "string" + }, + { + "const": null + } + ] + }, + "url": { + "anyOf": [ + { + "type": "string" + }, + { + "const": null + } + ] + } + }, + "required": [ + "title", + "url" + ], + "additionalProperties": false + } + }, + { + "const": null + } + ] + }, + "id": { + "$ref": "#/definitions/id_case_study" + }, + "object-type": { + "const": "case-study" + } + }, + "required": [ + "name", + "summary", + "incident-date", + "incident-date-granularity", + "procedure", + "reported-by" + ], + "additionalProperties": false + }, + "meta": { + "type": "object", + "properties": {}, + "required": [], + "additionalProperties": true + } + }, + "required": [ + "study" + ], + "additionalProperties": false, + "$id": "atlas_website_case_study_schema", + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "ATLAS Website Case Study Schema", + "definitions": { + "id_tactic": { + "type": "string", + "pattern": "^AML\\.TA\\d{4}$" + }, + "id_technique": { + "type": "string", + "pattern": "^AML\\.T\\d{4}$" + }, + "id_subtechnique": { + "type": "string", + "pattern": 
"^AML\\.T\\d{4}\\.\\d{3}$" + }, + "id_case_study": { + "type": "string", + "pattern": "^AML\\.CS\\d{4}$" + } + } +} \ No newline at end of file diff --git a/schemas/README.md b/schemas/README.md new file mode 100644 index 0000000..c9eedf4 --- /dev/null +++ b/schemas/README.md @@ -0,0 +1,20 @@ +# Schemas + +The project uses the [schema library](https://github.com/keleshev/schema) to define and validate its data. + +- `atlas_id.py` defines ATLAS ID regular expression patterns. +- `atlas_matrix.py` holds the schema for the `ATLAS.yaml` file. +- `atlas_obj.py` holds schemas for tactic, technique, subtechnique, and case study objects. + +## Usage + +The schemas in this directory are used as test fixures in `conftest.py`. `tests/schema_validation.py` validates each ATLAS data object. + +Additionally, JSON Schema files for `ATLAS.yaml` and website case study files are available at `dist/schemas/` for other tools to use. For example, the ATLAS website validates uploaded case study files against the case study schema file. + +### Output generation + +To re-generate JSON Schema files after modifying the schemas in this directory, run this from the project root: +``` +python -m tools.generate_schema +``` diff --git a/schemas/atlas_id.py b/schemas/atlas_id.py new file mode 100644 index 0000000..3c22686 --- /dev/null +++ b/schemas/atlas_id.py @@ -0,0 +1,31 @@ +from schema import Regex, Schema + +"""Describes ATLAS ID schemas.""" + +# Constants for ID formats +TACTIC_ID_PATTERN = r'AML\.TA\d{4}' # AML.TA0000 +TECHNIQUE_ID_PATTERN = r'AML\.T\d{4}' # AML.T0000 +SUBTECHNIQUE_ID_PATTERN = r'AML\.T\d{4}\.\d{3}' # AML.T0000.000 +CASE_STUDY_ID_PATTERN = r'AML\.CS\d{4}' # AML.CS0000 + +# Exact match patterns for the above, in Schema form +TACTIC_ID_REGEX_EXACT = Schema( + Regex(f'^{TACTIC_ID_PATTERN}$'), + name="id_tactic", + as_reference=True +) +TECHNIQUE_ID_REGEX_EXACT = Schema( + Regex(f'^{TECHNIQUE_ID_PATTERN}$'), + name="id_technique", + as_reference=True +) +SUBTECHNIQUE_ID_REGEX_EXACT = Schema( + Regex(f'^{SUBTECHNIQUE_ID_PATTERN}$'), + name="id_subtechnique", + as_reference=True +) +CASE_STUDY_ID_REGEX_EXACT = Schema( + Regex(f'^{CASE_STUDY_ID_PATTERN}$'), + name="id_case_study", + as_reference=True +) diff --git a/schemas/atlas_matrix.py b/schemas/atlas_matrix.py new file mode 100644 index 0000000..7aa516a --- /dev/null +++ b/schemas/atlas_matrix.py @@ -0,0 +1,32 @@ +from datetime import datetime +import json + +from schema import Literal, Or, Schema + +from .atlas_obj import ( + tactic_schema, + technique_schema, + subtechnique_schema, + case_study_schema +) + +"""Describes the ATLAS.yaml schema, which corresponds to data/matrix.yaml.""" + +atlas_matrix_schema = Schema( + { + "id": str, + "name": str, + "version": Or(str, int, float), + "tactics": [ + tactic_schema + ], + "techniques": [ + Or(technique_schema, subtechnique_schema) + ], + "case-studies": [ + case_study_schema + ] + }, + name='ATLAS Matrix Schema', + description=f'Generated on {datetime.now().strftime("%Y-%m-%d")}' +) diff --git a/schemas/atlas_obj.py b/schemas/atlas_obj.py new file mode 100644 index 0000000..b860fc7 --- /dev/null +++ b/schemas/atlas_obj.py @@ -0,0 +1,86 @@ +import datetime + +from schema import Or, Optional, Schema + +from .atlas_id import ( + TACTIC_ID_REGEX_EXACT, + TECHNIQUE_ID_REGEX_EXACT, + SUBTECHNIQUE_ID_REGEX_EXACT, + CASE_STUDY_ID_REGEX_EXACT +) + +"""Describes ATLAS object schemas. + +The Schema objects defined are set to be definitions referenced +by the provided name. 
+""" + +tactic_schema = Schema( + { + "id": TACTIC_ID_REGEX_EXACT, + "object-type": 'tactic', + "description": str, + "name": str, + }, + name="tactic", + as_reference=True +) + +technique_schema = Schema( + { + "id": TECHNIQUE_ID_REGEX_EXACT, + "object-type": "technique", + "name": str, + "description": str, + "tactics": [ + TACTIC_ID_REGEX_EXACT # List of tactic IDs + ] + }, + name="technique", + as_reference=True +) + +subtechnique_schema = Schema( + { + "id": SUBTECHNIQUE_ID_REGEX_EXACT, + "object-type": "technique", + "name": str, + "description": str, + "subtechnique-of": TECHNIQUE_ID_REGEX_EXACT # Top-level technique ID + }, + name="subtechnique", + as_reference=True +) + +case_study_schema = Schema( + { + "id": CASE_STUDY_ID_REGEX_EXACT, + "object-type": "case-study", + "name": str, + "summary": str, + "incident-date": datetime.date, + "incident-date-granularity": Or('YEAR', 'MONTH', 'DATE'), + "procedure": [ + { + "tactic": TACTIC_ID_REGEX_EXACT, + "technique": Or( + TECHNIQUE_ID_REGEX_EXACT, # top-level techniquye + SUBTECHNIQUE_ID_REGEX_EXACT # subtechnique + ), + "description": str + } + ], + "reported-by": str, + Optional("references"): Or( + [ + { + "title": Or(str, None), + "url": Or(str, None) + } + ] + , None + ) + }, + name="case_study", + as_reference=True +) \ No newline at end of file diff --git a/tests/.yamllint b/tests/.yamllint new file mode 100644 index 0000000..20f2954 --- /dev/null +++ b/tests/.yamllint @@ -0,0 +1,8 @@ +--- +extends: default + +rules: + line-length: disable + indentation: + spaces: consistent + indent-sequences: consistent diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..057e717 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,23 @@ +# Tests + +This project uses [pytest](https://docs.pytest.org/) to validate ATLAS data. + +- `conftest.py` + + Test fixtures are defined in `conftest.py` in the project root, for access to tools and schemas. + + Loads ATLAS data as constructed from `data/matrix.yaml` via `tools/create_matrix.py`. +- `tests/test_*.py` + + Current tests include schema validation, Markdown link syntax, and warnings for spelling. + + To add words to the spellcheck, edit `custom_words.txt` in this directory. +- `tests/.yamllint` holds custom [YAML lint configuration](https://yamllint.readthedocs.io/en/stable/index.html) rules. + +## Installation + +Install dependencies using: +`pip install -r tools/requirements.txt` +`pip install -r tests/requirements.txt` + +## Usage + +From the root of this project, run `pytest`. + +Additional YAML linting can be performed with `yamllint -c tests/.yamllint .` \ No newline at end of file diff --git a/tests/custom_words.txt b/tests/custom_words.txt new file mode 100644 index 0000000..5ff2c8e --- /dev/null +++ b/tests/custom_words.txt @@ -0,0 +1,85 @@ +2's +adversarially +algorithmically +antimalware +apis +apk +apks +apktool +att&ck +aws +blogposts +botnets +c&c +chatbot +classifiers +clearview +clearviewai +cleverhans +cnn +colaboratory +cve +cylance +cylance's +datasets +deepquarantine +dga +e.g. +endpoints +executables +foolbox +gpt +gpu +gpus +h5 +hdf5 +http +hyperparameters +i.e. 
+imagenet +implementations +interleaved +kaspersky +kaspersky's +metame +misclassification +misclassifications +misclassified +misclassify +misconfiguration +misconfigured +mitre's +ml +mlaas +model(s) +onnx +openai +optimizes +pb +perceptibility +pii +pkl +powershell +proofpoint +proofpoint's +pth +pytorch +r&d +reproducibility +rfc +robustness +roms +sharepoint +sql +systran +tay's +tensorflow +tf +tflite +uis +urlnet +urls +virustotal +workloads +workspaces +yaml diff --git a/tests/requirements.txt b/tests/requirements.txt new file mode 100644 index 0000000..2d03729 --- /dev/null +++ b/tests/requirements.txt @@ -0,0 +1,3 @@ +pyspellchecker==0.6.2 +pytest==6.2.5 +yamllint==1.26.3 diff --git a/tests/spellcheck.py b/tests/spellcheck.py new file mode 100644 index 0000000..43843ea --- /dev/null +++ b/tests/spellcheck.py @@ -0,0 +1,17 @@ +import os +from spellchecker import SpellChecker + +""" +Sets up usage of https://pyspellchecker.readthedocs.io/en/latest/. +""" + +# Add words to the spellcheck by adding to this file +custom_words_file = os.path.join(os.path.dirname(__file__), "custom_words.txt") + +# Read in list of words +with open(custom_words_file) as f: + CUSTOM_WORDS = [w.strip() for w in f.readlines()] + +# Create English spell checker with additional custom words for syntax test use +SPELL_CHECKER = SpellChecker() +SPELL_CHECKER.word_frequency.load_words(CUSTOM_WORDS) diff --git a/tests/test_schema_validation.py b/tests/test_schema_validation.py new file mode 100644 index 0000000..3fd1649 --- /dev/null +++ b/tests/test_schema_validation.py @@ -0,0 +1,55 @@ +import pytest +from schema import SchemaError, SchemaWrongKeyError + +""" +Validates ATLAS data objects against schemas defined in conftest.py. +""" + +def test_validate_matrix(matrix_schema, matrix): + """Validates the ATLAS matrix dictionary. + Explicitly fails with message to capture more in pytest short test info. + """ + try: + matrix_schema.validate(matrix) + except SchemaError as e: + pytest.fail(e.code) + +def test_validate_tactics(tactic_schema, tactics): + """Validates each tactic dictionary. + Explicitly fails with message to capture more in pytest short test info. + """ + try: + tactic_schema.validate(tactics) + except SchemaError as e: + pytest.fail(e.code) + +def test_validate_techniques(technique_schema, subtechnique_schema, techniques): + """Validates each technique dictionary, both top-level and subtechniques. + Explicitly fails with message to capture more in pytest short test info. + """ + try: + # Check if dictionary is a top-level technique + technique_schema.validate(techniques) + except (SchemaWrongKeyError, SchemaError) as e: + # Could be a subtechnique + # SchemaWrongKeyError: flagging on presence of 'subtechnique-of' + # SchemaError: flagging on ID having extra numbers at end + if e.code.startswith("Wrong key 'subtechnique-of'") or "does not match" in e.code: + try: + # Validate the subtechnique + subtechnique_schema.validate(techniques) + except SchemaError as se: + # Fail with any errors + pytest.fail(se.code) + else: + # Otherwise is another key error + pytest.fail(e.code) + +def test_validate_case_studies(case_study_schema, case_studies): + """Validates each case study dictionary. + Explicitly fails with message to capture more in pytest short test info. 
+ """ + try: + case_study_schema.validate(case_studies) + except SchemaError as e: + pytest.fail(e.code) \ No newline at end of file diff --git a/tests/test_syntax.py b/tests/test_syntax.py new file mode 100644 index 0000000..5bb0b60 --- /dev/null +++ b/tests/test_syntax.py @@ -0,0 +1,134 @@ +import re +import warnings + +import pytest + +from schemas.atlas_id import TACTIC_ID_PATTERN, TECHNIQUE_ID_PATTERN, SUBTECHNIQUE_ID_PATTERN +from spellcheck import SPELL_CHECKER + +""" +Validates text for internal and external Markdown links and warns for spelling. +""" + +# Markdown Link syntax +# [title](url) +REGEX_MARKDOWN_LINK = re.compile(r'\[([^\[]+)\]\((.*?)\)') + +# Fully-qualified URLs +# https://stackoverflow.com/a/17773849 +REGEX_URL = re.compile(r'^(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})$') + +# Internal Markdown links, assumed to be only to /tactics/ and /techniques/ +# Note that the regex objects here are from conftest.py and are the schema library's objects, hence the pattern_str property +REGEX_INTERNAL_URL = re.compile( + rf'^/tactics/{TACTIC_ID_PATTERN}' + r'|' + rf'/techniques/{SUBTECHNIQUE_ID_PATTERN}' # Match subtechnique pattern first because top-level technique also matches this + r'|' + rf'/techniques/{TECHNIQUE_ID_PATTERN}$' + ) + +def test_markdown_link(text_with_possible_markdown_syntax): + """Validates Markdown link syntax for internal and external links. + + Assumes that external links are fully qualified, i.e. start with http(s) and other URL constraints. + Assumes that internal links are to /tactics/ and /techniques/ and match ID formats. + """ + # Text is second element in tuple of (text identifier, text) + text = text_with_possible_markdown_syntax[1] + # Find all Markdown links fitting the []() syntax + links = REGEX_MARKDOWN_LINK.findall(text) + # Track error messages + errors = [] + + # Iterate over parts of Markdown link + for title, url in links: + # Title + if not title: + # Titles should not be empty + errors.append(f'Got empty title for Markdown link with URL ({url})') + + elif '{' in title: + # Titles shouldn't contain curly brackets like in a dict (ex. if anchor typo of "anchor" instead of "anchor.name") + errors.append(f'Expected not to find the character {{ in Markdown link title, got {title}') + + # URL + if not url: + # URLs should not be empty + errors.append(f'Got empty URL for Markdown link with title [{title}]') + + elif url.startswith('http') and REGEX_URL.match(url) is None: + # Ensure that external URL is fully-qualified and doesn't contain invalid characters + errors.append(f'Expected a fully-qualified URL, got ({url})') + + elif not url.startswith('http'): + # Internal ATLAS link should match expected prefix and ID syntax + if not REGEX_INTERNAL_URL.match(url): + errors.append(f'Expected internal Markdown link URL to start with /techniques/ or /tactics/ and match ID format, got ({url})') + + if errors: + # Fail test with error messages + error_str = '\n'.join(errors) + pytest.fail(error_str) + + +# Parses out string tokens to be spell checked +REGEX_WORDS = re.compile( + r"\b" # Start at word boundary + r"(?!s)" # Excludes just "s", i.e. from a posessive + r"(?![iegUS]\.)" # Excludes i.e., e.g., U.S. + r"(?!\d+[MKB]\b)" # Excludes 70K, M, B + r"(?:" # Non capture group + r"[\w&]+" # All words, can have &, i.e. R&D + r"(?:'t)?" # Optionally include contractions + r"(?:\(s\))?" 
# Optionally include (s) at end + r")" + ) + +def test_spelling(text_to_be_spellchecked): + """Warns for potentially mispelled words from names and descriptions. + Only checks text outside of Markdown links. + See tests/custom_words.txt for exclusion words. + """ + # Text is second element in tuple of (text identifier, text) + text = text_to_be_spellchecked[1] + # Remove Markdown links + stripped_text = REGEX_MARKDOWN_LINK.sub('', text) + # Tokenize, see comments above at variable declaration + text_tokens = REGEX_WORDS.findall(stripped_text) + + # Get a set of potentially mispelled words + possible_mispelled = SPELL_CHECKER.unknown(text_tokens) + if possible_mispelled: + # Emit warnings + msg = 'Not recognized by spellcheck - fix or exclude in tests/custom_words.txt: ' + warnings.warn(msg + str(possible_mispelled)) + +def test_ascii(text_to_be_spellchecked): + """Warns for text containing non-ascii characters, likely from copy and pastes, + which will cause YAML output to be a literal YAML string and reduce readability. + + Example: + ’, the unicode right single quotation mark is rendered as \u2019 in a literal string, + along with explicit newline characters \n. + Replacing with ' produces a regular YAML string. + """ + # Text is second element in tuple of (text identifier, text) + text = text_to_be_spellchecked[1] + do_warn = False + try: + # Check for non-ascii text in Python 3.7+ + if not text.isascii(): + do_warn = True + except AttributeError: + # Fallback for older versions of Python + try: + text.encode('ascii') + except UnicodeEncodeError: + do_warn = True + + # Warn on non-ascii for YAML output + if do_warn: + # Potentially an unicode quote or similar + msg = f'Contains non-ascii, consider fixing. YAML output will be the literal string: {ascii(text)}' + warnings.warn(msg) diff --git a/tools/README.md b/tools/README.md new file mode 100644 index 0000000..d55de2a --- /dev/null +++ b/tools/README.md @@ -0,0 +1,29 @@ +# Tools + +Scripts to generate the distributed files and import data files. + +- `create_matrix.py` compiles the threat matrix data sources into a single standard YAML file, `ATLAS.yaml`. See more about [generating outputs from data](../data/README.md#output-generation) + +- `generate_schema.py` outputs JSON Schema files for external validation of `ATLAS.yaml` and website case study files. See more on [schema files](../schemas/README.md). + +- `import_case_study_file.py` imports case study files from the ATLAS website into ATLAS data as newly-IDed, templated files. See more about [updating case studies](../data/README.md#case-studies). + +Run each script with `-h` to see full options. + +## Development Setup + +1. Use Python 3.6+. + +2. Set up a [virtual environment](https://docs.python.org/3/library/venv.html). For example: + ``` + python3 -m venv venv + source venv/bin/activate + pip install --upgrade pip + ``` + + +3. Install dependencies for running tools scripts and tests. + ``` + pip install -r tools/requirements.txt + pip install -r tests/requirements.txt + ``` \ No newline at end of file diff --git a/tools/create_matrix.py b/tools/create_matrix.py new file mode 100644 index 0000000..c985bf7 --- /dev/null +++ b/tools/create_matrix.py @@ -0,0 +1,172 @@ +from argparse import ArgumentParser +from pathlib import Path + +from jinja2 import Environment +import yaml + +""" +Creates the combined ATLAS YAML file from source data. 
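+
+Typical invocation from the project root (the values shown are the argparse
+defaults in main() below):
+
+    python tools/create_matrix.py --matrix data/matrix.yaml --output dist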
+""" + + +def main(): + parser = ArgumentParser() + parser.add_argument("--matrix", "-m", type=str, default="data/matrix.yaml", help="Path to matrix.yaml") + parser.add_argument("--output", "-o", type=str, default="dist", help="Output directory") + args = parser.parse_args() + + # Create output directories as needed + output_dir = Path(args.output) + output_dir.mkdir(parents=True, exist_ok=True) + + # Load and transform data + matrix = load_atlas_data(args.matrix) + + # Save composite document as a standard yaml file + output_filepath = output_dir / f"{matrix['id']}.yaml" + with open(output_filepath, "w") as f: + yaml.dump(matrix, f, default_flow_style=False, explicit_start=True, sort_keys=False) + +def load_atlas_data(matrix_yaml_filepath): + """Returns a dictionary representing ATLAS data as read from the provided YAML files.""" + # Load yaml with custom loader that supports !include and cross-doc anchors + data, anchors = load_atlas_yaml(matrix_yaml_filepath) + + ## Jinja template evaluation + + # Use YAML default style of literal string "" wrappers to handle apostophes/single quotes in the text + data_str = yaml.dump(data, default_flow_style=False, sort_keys=False, default_style='"') + # Set up data as Jinja template + env = Environment() + template = env.from_string(data_str) + # Validate template - throws a TemplateSyntaxError if invalid + env.parse(template) + + # Replace all "super aliases" in strings in the document + populated_data_str = template.render(anchors) + # Convert populated data string back to a dictionary + data = yaml.safe_load(populated_data_str) + + ## Construct output format + + # Objects are lists of lists under 'data' as !includes are list items + # Flatten the objects + objects = [object for objects in data["data"] for object in objects] + + # Organize objects into dicts by object-type + # and make sure tactics are in the order defined in the matrix + matrix = { + "id": data["id"], + "name": data["name"], + "version": data["version"], + "tactics": data["tactics"], + "techniques": [], + "case-studies": [] + } + for object in objects: + if object["object-type"] == "technique": + matrix["techniques"].append(object) + elif object["object-type"] == "tactic": + if object["id"] in matrix["tactics"]: + idx = matrix["tactics"].index(object["id"]) + matrix["tactics"][idx] = object + elif object["object-type"] == "case-study": + matrix["case-studies"].append(object) + + return matrix + +def load_atlas_yaml(matrix_yaml_filepath): + """Returns two dictionaries representing templated ATLAS data as read from the provided YAML files. 
+ + Returns: data, anchors + data + """ + # Load yaml with custom loader that supports !include and cross-doc anchors + master = yaml.SafeLoader("") + with open(matrix_yaml_filepath, "rb") as f: + data = yaml_safe_load(f, master=master) + + # Construct anchors into dict store and for further parsing + const = yaml.constructor.SafeConstructor() + anchors = {k: const.construct_document(v) for k, v in master.anchors.items()} + + return data, anchors + +#region Support !include in YAML + +# Adapted from https://stackoverflow.com/a/44913652 + +def compose_document(self): + """Allows for cross-document anchors.""" + self.get_event() + node = self.compose_node(None, None) + self.get_event() + # self.anchors = {} # <<<< commented out + return node + +# Add functionality to SafeLoader +yaml.SafeLoader.compose_document = compose_document + +# Add !include constructor +# Adapted from http://code.activestate.com/recipes/577613-yaml-include-support/ +def yaml_include(loader, node): + """Returns a document or list of documents specified by a filepath which can contain wildcards.""" + # Process input argument + # node.value is assumed to be a relative filepath that may include wildcards + has_wildcard = '*' in node.value + # Construct path relative to current working dir + include_path = loader.input_dir_path / node.value + + # Validate inputs + if include_path.suffix not in ['.yaml', '.yml']: + # Check file extension + raise ValueError(f'Expected !include path to end in .yaml or .yml, got "{node.value}" ending in "{include_path.suffix}"') + if not has_wildcard and not include_path.exists(): + # Specified file does not exist + raise FileNotFoundError(node.value) + + # Construct outputs + # Note that both approaches, returning a self-constructed list for wildcards + # and returning a document of lists results in the same 2x nested list format + # which is why nested lists are flattened in load_atlas_data + + if has_wildcard: + # Collect documents into a single array + results = [] + # Get all matching files relative to the directory the input matrix.yaml lives in + filepaths = loader.input_dir_path.glob(node.value) + # Read in each file in name-order and append to results + for filepath in sorted(filepaths): + with open(filepath) as inputfile: + result = yaml_safe_load(inputfile, master=loader) + results.append(result) + + return results + + else: + # Return specified document + with open(include_path) as inputfile: + return yaml_safe_load(inputfile, master=loader) + +# Add custom !include constructor +yaml.add_constructor("!include", yaml_include, Loader=yaml.SafeLoader) + +def yaml_safe_load(stream, Loader=yaml.SafeLoader, master=None): + """Loads the specified file stream while preserving anchors for later use.""" + loader = Loader(stream) + # Store the input file directory for later joining with !include paths + # ex. stream.name is 'data/matrix.yaml', input_dir_path is Path('data') + # ex. 
stream.name is 'matrix.yaml', input_dir_path is Path('.') + loader.input_dir_path = Path(stream.name).parent + + if master is not None: + loader.anchors = master.anchors + try: + return loader.get_single_data() + finally: + loader.dispose() + +#endregion + +if __name__ == "__main__": + main() diff --git a/tools/generate_schema.py b/tools/generate_schema.py new file mode 100644 index 0000000..b9bdcfc --- /dev/null +++ b/tools/generate_schema.py @@ -0,0 +1,81 @@ +from argparse import ArgumentParser +from datetime import datetime +import json +from pathlib import Path + +from schema import Optional, Schema + +# Local directory +from schemas.atlas_matrix import atlas_matrix_schema +from schemas.atlas_obj import case_study_schema + +""" +Generates JSON Schema Draft-07 files describing ATLAS.yaml and case study files +from the ATLAS website. + +Reads from the schemas directory in this repository. + +Run this script with `python -m tools.generate_schema` to allow for local imports. +""" + +def set_optional_keys(schema_obj, keys): + """Sets the specified keys on the Schema object to Optional.""" + for key in keys: + # Set the key to be optional + schema_obj._schema[Optional(key)] = schema_obj._schema[key] + # Remove existing required key + del schema_obj._schema[key] + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_argument("--output", "-o", type=str, default="dist/schemas", help="Output directory") + args = parser.parse_args() + + # Create output directories as needed + output_dir = Path(args.output) + output_dir.mkdir(parents=True, exist_ok=True) + + # Overall ATLAS YAML + atlas_json_schema = atlas_matrix_schema.json_schema('atlas_matrix_schema') + output_filepath = output_dir / 'atlas_matrix_schema.json' + with open(output_filepath, 'w') as f: + json.dump(atlas_json_schema, f, indent=4) + print(f'Wrote ATLAS.yaml schema to {output_filepath}') + + # ATLAS website case study + + # Set the `id` and `object-type `fields as optional + # Case study builder files may not yet have them, but downloaded existing case studies do + set_optional_keys(case_study_schema, ['id', 'object-type']) + + # Generate JSON schema from pre-defined schema + + # The website's version of a case study file includes the case study object under the key `study` + # as well as an optional `meta` key containing date created, etc., populated upon website + # case study builder download + name = 'ATLAS Website Case Study Schema' + description = f'Generated on {datetime.now().strftime("%Y-%m-%d")}' + standalone_case_study_schema = Schema( + { + "study": case_study_schema.schema, + Optional("meta"): { + # Handle any keys and values + str: object + } + }, + name=name, + description=description) + + # Convert to JSON Schema + atlas_case_study_json_schema = standalone_case_study_schema.json_schema('atlas_website_case_study_schema') + + # Manipulate JSON to ensure incident date is a date of format YYYY-MM-DD + # Currently schema library does not output a string format + # https://json-schema.org/understanding-json-schema/reference/string.html#dates-and-times + atlas_case_study_json_schema['properties']['study']['properties']['incident-date']['format'] = 'date' + + # Output schema to file + output_filepath = output_dir / 'atlas_website_case_study_schema.json' + with open(output_filepath, 'w') as f: + json.dump(atlas_case_study_json_schema, f, indent=4) + print(f'Wrote ATLAS case study schema to {output_filepath}') diff --git a/tools/import_case_study_file.py b/tools/import_case_study_file.py new file mode 100644 
index 0000000..d8fd1f7 --- /dev/null +++ b/tools/import_case_study_file.py @@ -0,0 +1,160 @@ +from argparse import ArgumentParser +from functools import partial +from pathlib import Path +import re + +import yaml + +from create_matrix import load_atlas_yaml + +""" +Imports case study files into ATLAS data as newly-IDed files. + +Case study files are those that have been downloaded from the ATLAS website's /studies/create page. + +ATLAS IDs are converted to expressions that use ATLAS YAML anchors. +""" + +# Numeric portion of an ATLAS case study ID +REGEX_CS_ID_NUM = re.compile(r'AML\.CS(\d+)') +# Match for any ATLAS tactic, technique, or subtechnique ID +REGEX_ID = re.compile(r'AML\.TA?(?:\d+)(?:\.\d+)?') +# Markdown link to a tactics or techniques page - captures title and ID part of URL +REGEX_INTERNAL_LINK = re.compile(r'\[([^\[]+)\]\(\/(?:techniques|tactics)\/(.*?)\)') +# Captures string version of 'incident-date: YYYY-MM-DD', trimming off end of fully-formatted ISO +# ex. !!timestamp "2021-11-01T00:00:00.000Z", !!timestamp "2022-02-15 02:40:33+00:00" +REGEX_INCIDENT_DATE = re.compile(r'!!timestamp "(\d{4}-\d{2}-\d{2})(?:[\d:\.+TZ ]+)?"') + +def main(): + parser = ArgumentParser('Imports case study files into ATLAS data as newly-IDed files.') + parser.add_argument("files", type=str, nargs="+", help="Path to case study file(s)") + args = parser.parse_args() + + # Construct dictionary of ATLAS IDs to anchor variable names + _, anchor2obj = load_atlas_yaml('data/matrix.yaml') + id2anchor = {obj['id']: anchor for (anchor, obj) in anchor2obj.items()} + + # Use ID-to-anchor dictionary in regex sub handlers + replace_link_anchor = partial(replace_link, id2anchor) + replace_id_anchor = partial(replace_id, id2anchor) + + # Parse and output case study files + for file in args.files: + + # Find next ATLAS ID and path to that new YAML file in data/case-studies/ + new_filepath = find_next_filepath() + new_id = new_filepath.stem + + # read_case_study_file(file, sub_id_anchor, new_filepath) + + with open(file, 'r') as f: + # Read in file + data = yaml.safe_load(f) + # Case study file data is held in 'study' key + case_study = data['study'] + + # Convert to string representation for regex + data_str = yaml.dump(case_study, default_flow_style=False, sort_keys=False, default_style='"') + + # Replace link anchors with template expressions + data_str = REGEX_INTERNAL_LINK.sub(replace_link_anchor, data_str) + # Replace IDs with template expressions + data_str = REGEX_ID.sub(replace_id_anchor, data_str) + # Trim incident dates, which may be in full ISO8601 format + data_str = REGEX_INCIDENT_DATE.sub(replace_timestamp, data_str) + + # Load back in from string representation + case_study = yaml.safe_load(data_str) + + # Strip newlines on summary + case_study['summary'] = case_study['summary'].strip() + # Strip newlines on procedure descriptions + for step in case_study['procedure']: + step['description'] = step['description'].strip() + + # Add new ID and case study object type at beginning of dict + new_case_study = { + 'id': new_id, + 'object-type': 'case-study' + } + new_case_study.update(case_study) + + # Write out new individual case study file + with open(new_filepath, 'w') as o: + yaml.dump(new_case_study, o, default_flow_style=False, explicit_start=True, sort_keys=False) + + print(f'{new_filepath} <- {file}') + + print(f'\nImported {len(args.files)} file(s) - review, run pytest for spellcheck exclusions, then run tools/create_matrix.py for ATLAS.yaml.') + +def find_next_filepath(): + """Returns a Path 
to a case study YAML file with next available ATLAS ID.""" + # Open output directory, assumed to be from root project dir + case_study_dir = Path('data/case-studies') + # Retrieve all YAML files and get the last file in alphabetical order + filepaths = sorted(case_study_dir.glob('*.yaml')) + # Filepath with highest ID number + latest_filepath = filepaths[-1] + + # Parse out the numeric portion of the case study ID filename + match = REGEX_CS_ID_NUM.match(latest_filepath.stem) + + if match: + # Only 1 match expected, i.e. 0015 + cur_id_num_str = match.groups()[0] + # Get next integer, i.e. 16 + next_id_num = int(cur_id_num_str) + 1 + # Padded by zeros, i.e. 0016 + next_id_num_str = '{:04d}'.format(next_id_num) + # Replace current number with the next increment + next_filepath_str = latest_filepath.as_posix().replace(cur_id_num_str, next_id_num_str) + # Return as a Path + return Path(next_filepath_str) + + # Otherwise no case study ID match + return None + +def replace_timestamp(match): + """Returns a string representation of a YAML timestamp with only the YYYY-MM-DD date portion.""" + if match: + date = match.group(1) + + return f'!!timestamp "{date}"' + + return match.group() + +def replace_id(id2anchor, match): + """Returns a string Jinja expression that accesses the id key of the anchor. + + Ex. {{anchor.id}} + """ + if match: + atlas_id = match.group() + return '{{' + id2anchor[atlas_id] + '.id}}' + + return match.group() + +def replace_link(id2anchor, match): + """Returns a string Jinja expression that creates an internal Markdown link for tactics and techniques. + + Ex. [{{anchor.name}}](/techniques/{{anchor.id}}) + """ + if match: + # Unwrap matches + full_link = match.group(0) + title = match.group(1) + atlas_id = match.group(2) + # Get anchor variable name + anchor = id2anchor[atlas_id] + + # Replace values with template expressions {{ anchor.xyz }} + # Note that double brackets evaluate to one bracket + full_link = full_link.replace(title, f'{{{{{anchor}.name}}}}') + full_link = full_link.replace(atlas_id, f'{{{{{anchor}.id}}}}') + + return full_link + + return match.group(0) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/tools/requirements.txt b/tools/requirements.txt new file mode 100644 index 0000000..6e1b0b7 --- /dev/null +++ b/tools/requirements.txt @@ -0,0 +1,5 @@ +easydict==1.9 +Jinja2==3.0.3 +python-dateutil==2.8.1 +PyYAML==5.4.1 +schema==0.7.4