From 36fac3f3a86701c39d9dfe24b9b0be014f1cd33a Mon Sep 17 00:00:00 2001 From: pyrco <105293448+pyrco@users.noreply.github.com> Date: Tue, 22 Aug 2023 12:14:44 +0200 Subject: [PATCH] Collect empty directories when the source is volatile (DIS-1931) --- acquire/acquire.py | 4 +-- acquire/collector.py | 20 +++++++----- acquire/outputs/base.py | 48 ++++++++++++++-------------- acquire/outputs/dir.py | 35 ++++++++++++++++----- acquire/outputs/tar.py | 10 ++++-- acquire/utils.py | 66 +-------------------------------------- acquire/volatilestream.py | 66 +++++++++++++++++++++++++++++++++++++++ tests/conftest.py | 9 +++--- tests/test_collector.py | 35 +++++++++++++++++++-- tests/test_outputs_dir.py | 58 ++++++++++++++++++++++++++++++++++ tests/test_outputs_tar.py | 41 ++++++++++++++++++++++++ 11 files changed, 278 insertions(+), 114 deletions(-) create mode 100644 acquire/volatilestream.py create mode 100644 tests/test_outputs_dir.py create mode 100644 tests/test_outputs_tar.py diff --git a/acquire/acquire.py b/acquire/acquire.py index 7e078685..d6d61bea 100644 --- a/acquire/acquire.py +++ b/acquire/acquire.py @@ -550,8 +550,8 @@ def _run(cls, target: Target, cli_args: argparse.Namespace, collector: Collector ) return - collector.output.write_entry(mem_dump_output_path, entry=mem_dump_path) - collector.output.write_entry(mem_dump_errors_output_path, entry=mem_dump_errors_path) + collector.output.write_entry(mem_dump_output_path, mem_dump_path) + collector.output.write_entry(mem_dump_errors_output_path, mem_dump_errors_path) collector.report.add_command_collected(cls.__name__, command_parts) mem_dump_path.unlink() mem_dump_errors_path.unlink() diff --git a/acquire/collector.py b/acquire/collector.py index a7e2f8bc..718a9fc3 100644 --- a/acquire/collector.py +++ b/acquire/collector.py @@ -230,7 +230,7 @@ def unbind(self) -> None: def close(self) -> None: self.output.close() - def _create_output_path(self, path: Path, base: Optional[str] = None) -> str: + def _get_output_path(self, path: Path, base: Optional[str] = None) -> str: base = base or self.base outpath = str(path) @@ -299,14 +299,14 @@ def collect_file( log.info("- Collecting file %s: Skipped (DEDUP)", path) return - outpath = self._create_output_path(outpath or path, base) + outpath = self._get_output_path(outpath or path, base) try: entry = path.get() if volatile: - self.output.write_volatile(outpath, entry, size) + self.output.write_volatile(outpath, entry, size=size) else: - self.output.write_entry(outpath, entry, size) + self.output.write_entry(outpath, entry, size=size) self.report.add_file_collected(module_name, path) result = "OK" @@ -322,8 +322,8 @@ def collect_file( def collect_symlink(self, path: fsutil.TargetPath, module_name: Optional[str] = None) -> None: try: - outpath = self._create_output_path(path) - self.output.write_bytes(outpath, b"", path.get(), 0) + outpath = self._get_output_path(path) + self.output.write_entry(outpath, path.get()) self.report.add_symlink_collected(module_name, path) result = "OK" @@ -359,11 +359,17 @@ def collect_dir( return seen_paths.add(resolved) + dir_is_empty = True for entry in path.iterdir(): + dir_is_empty = False self.collect_path( entry, seen_paths=seen_paths, module_name=module_name, follow=follow, volatile=volatile ) + if dir_is_empty and volatile: + outpath = self._get_output_path(path) + self.output.write_entry(outpath, path) + except OSError as error: if error.errno == errno.ENOENT: self.report.add_dir_missing(module_name, path) @@ -485,7 +491,7 @@ def collect_command_output( return def write_bytes(self, destination_path: str, data: bytes) -> None: - self.output.write_bytes(destination_path, data, None) + self.output.write_bytes(destination_path, data) self.report.add_file_collected(self.bound_module_name, destination_path) diff --git a/acquire/outputs/base.py b/acquire/outputs/base.py index 8a883108..f63639f9 100644 --- a/acquire/outputs/base.py +++ b/acquire/outputs/base.py @@ -2,22 +2,18 @@ from pathlib import Path from typing import BinaryIO, Optional, Union -from dissect.target import Target from dissect.target.filesystem import FilesystemEntry -import acquire.utils +from acquire.volatilestream import VolatileStream class Output: """Base class to implement acquire output formats with. New output formats must sub-class this class. - - Args: - target: The target that we're using acquire on. """ - def init(self, target: Target): + def init(self, path: Path, **kwargs) -> None: pass def write( @@ -27,12 +23,12 @@ def write( entry: Optional[Union[FilesystemEntry, Path]], size: Optional[int] = None, ) -> None: - """Write a filesystem entry or file-like object to the implemented output type. + """Write a file-like object to the output. Args: - output_path: The path of the entry in the output format. + output_path: The path of the entry in the output. fh: The file-like object of the entry to write. - entry: The optional filesystem entry of the entry to write. + entry: The optional filesystem entry to write. size: The optional file size in bytes of the entry to write. """ raise NotImplementedError() @@ -40,18 +36,21 @@ def write( def write_entry( self, output_path: str, - entry: Optional[Union[FilesystemEntry, Path]], + entry: Union[FilesystemEntry, Path], size: Optional[int] = None, ) -> None: - """Write a filesystem entry to the output format. + """Write a filesystem entry to the output. Args: - output_path: The path of the entry in the output format. - entry: The optional filesystem entry of the entry to write. + output_path: The path of the entry in the output. + entry: The filesystem entry to write. size: The optional file size in bytes of the entry to write. """ - with entry.open() as fh: - self.write(output_path, fh, entry, size) + if entry.is_dir() or entry.is_symlink(): + self.write_bytes(output_path, b"", entry, size=0) + else: + with entry.open() as fh: + self.write(output_path, fh, entry, size=size) def write_bytes( self, @@ -63,31 +62,32 @@ def write_bytes( """Write raw bytes to the output format. Args: - output_path: The path of the entry in the output format. + output_path: The path of the entry in the output. data: The raw bytes to write. - entry: The optional filesystem entry of the entry to write. + entry: The optional filesystem entry to write. size: The optional file size in bytes of the entry to write. """ stream = io.BytesIO(data) - self.write(output_path, stream, entry, size) + self.write(output_path, stream, entry, size=size) def write_volatile( self, output_path: str, - entry: Optional[Union[FilesystemEntry, Path]], + entry: Union[FilesystemEntry, Path], size: Optional[int] = None, ) -> None: - """Write specified path to the output format. + """Write a filesystem entry to the output. + Handles files that live in volatile filesystems. Such as procfs and sysfs. Args: - output_path: The path of the entry in the output format. - entry: The optional filesystem entry of the entry to write. + output_path: The path of the entry in the output. + entry: The filesystem entry to write. size: The optional file size in bytes of the entry to write. """ try: - fh = acquire.utils.VolatileStream(Path(entry.path)) + fh = VolatileStream(Path(entry.path)) buf = fh.read() size = size or len(buf) except (OSError, PermissionError): @@ -96,7 +96,7 @@ def write_volatile( buf = b"" size = 0 - self.write_bytes(output_path, buf, entry, size) + self.write_bytes(output_path, buf, entry, size=size) def close(self) -> None: """Closes the output.""" diff --git a/acquire/outputs/dir.py b/acquire/outputs/dir.py index 2e165067..e1d359eb 100644 --- a/acquire/outputs/dir.py +++ b/acquire/outputs/dir.py @@ -13,18 +13,39 @@ def __init__(self, path: Path, **kwargs): self.path = path def write( - self, output_path: str, fh: BinaryIO, entry: Optional[Union[FilesystemEntry, Path]], size: Optional[int] = None + self, + output_path: str, + fh: BinaryIO, + entry: Optional[Union[FilesystemEntry, Path]], + size: Optional[int] = None, ) -> None: + """Write a file-like object to a directory. + + The data from ``fh`` is written, while ``entry`` is used to get some properties of the file. + + On Windows platforms ``:`` is replaced with ``_`` in the output_path. + + Args: + output_path: The path of the entry in the output. + fh: The file-like object of the entry to write. + entry: The optional filesystem entry to write. + size: The optional file size in bytes of the entry to write. + """ if platform.system() == "Windows": output_path = output_path.replace(":", "_") - out_path = self.path.joinpath(output_path) - out_dir = out_path.parent - if not out_dir.exists(): - out_dir.mkdir(parents=True) + out_path = self.path.joinpath(output_path.lstrip("/")) + + if entry and entry.is_dir(): + out_dir = out_path + out_dir.mkdir(parents=True, exist_ok=True) + + else: + out_dir = out_path.parent + out_dir.mkdir(parents=True, exist_ok=True) - with out_path.open("wb") as fhout: - shutil.copyfileobj(fh, fhout) + with out_path.open("wb") as fhout: + shutil.copyfileobj(fh, fhout) def close(self) -> None: pass diff --git a/acquire/outputs/tar.py b/acquire/outputs/tar.py index 9f93449b..6d50ebce 100644 --- a/acquire/outputs/tar.py +++ b/acquire/outputs/tar.py @@ -52,12 +52,14 @@ def write( entry: Optional[Union[FilesystemEntry, Path]], size: Optional[int] = None, ) -> None: - """Write a filesystem entry or file-like object to a tar file. + """Write a file-like object to a tar file. + + The data from ``fh`` is written, while ``entry`` is used to get some properties of the file. Args: - output_path: The path of the entry in the output format. + output_path: The path of the entry in the output. fh: The file-like object of the entry to write. - entry: The optional filesystem entry of the entry to write. + entry: The optional filesystem entry to write. size: The optional file size in bytes of the entry to write. """ stat = None @@ -79,6 +81,8 @@ def write( if entry.is_symlink(): info.type = tarfile.SYMTYPE info.linkname = entry.readlink() + elif entry.is_dir(): + info.type = tarfile.DIRTYPE stat = entry.lstat() diff --git a/acquire/utils.py b/acquire/utils.py index 452cde71..0f70f909 100644 --- a/acquire/utils.py +++ b/acquire/utils.py @@ -4,83 +4,19 @@ import getpass import json import os -import pathlib import re import sys import textwrap import traceback from enum import Enum -from io import SEEK_SET, UnsupportedOperation from pathlib import Path -from stat import S_IRGRP, S_IROTH, S_IRUSR from typing import Any, Optional from dissect.target import Target -from dissect.util.stream import AlignedStream from acquire.outputs import OUTPUTS from acquire.uploaders.plugin_registry import UploaderRegistry -try: - # Windows systems do not have the fcntl module. - from fcntl import F_SETFL, fcntl - - HAS_FCNTL = True -except ImportError: - HAS_FCNTL = False - - -class VolatileStream(AlignedStream): - """Streaming class to handle various procfs and sysfs edge-cases. Backed by `AlignedStream`. - - Args: - path: Path of the file to obtain a file-handle from. - mode: Mode string to open the file-handle with. Such as "rt" and "rb". - flags: Flags to open the file-descriptor with. - size: The maximum size of the stream. None if unknown. - """ - - def __init__( - self, - path: Path, - mode: str = "rb", - # Windows and Darwin systems don't have O_NOATIME or O_NONBLOCK. Add them if they are available. - flags: int = (os.O_RDONLY | getattr(os, "O_NOATIME", 0) | getattr(os, "O_NONBLOCK", 0)), - size: int = 1024 * 1024 * 5, - ): - self.fh = path.open(mode) - self.fd = self.fh.fileno() - - if HAS_FCNTL: - fcntl(self.fd, F_SETFL, flags) - - st_mode = os.fstat(self.fd).st_mode - write_only = (st_mode & (S_IRUSR | S_IRGRP | S_IROTH)) == 0 # novermin - - super().__init__(0 if write_only else size) - - def seek(self, pos: int, whence: int = SEEK_SET) -> int: - raise UnsupportedOperation("VolatileStream is not seekable") - - def seekable(self) -> bool: - return False - - def _read(self, offset: int, length: int) -> bytes: - result = [] - while length: - try: - buf = os.read(self.fd, min(length, self.size - offset)) - except BlockingIOError: - break - - if not buf: - break - - result.append(buf) - offset += len(buf) - length -= len(buf) - return b"".join(result) - class StrEnum(str, Enum): """Sortable and serializible string-based enum""" @@ -400,7 +336,7 @@ def persist_execution_report(path: Path, report_data: dict) -> Path: SYSVOL_SUBST = re.compile(r"^(/\?\?/)?[cC]:") -def normalize_path(target: Target, path: pathlib.Path, resolve: bool = False) -> str: +def normalize_path(target: Target, path: Path, resolve: bool = False) -> str: if resolve: path = path.resolve() diff --git a/acquire/volatilestream.py b/acquire/volatilestream.py new file mode 100644 index 00000000..19e57f03 --- /dev/null +++ b/acquire/volatilestream.py @@ -0,0 +1,66 @@ +import os +from io import SEEK_SET, UnsupportedOperation +from pathlib import Path +from stat import S_IRGRP, S_IROTH, S_IRUSR + +from dissect.util.stream import AlignedStream + +try: + # Windows systems do not have the fcntl module. + from fcntl import F_SETFL, fcntl + + HAS_FCNTL = True +except ImportError: + HAS_FCNTL = False + + +class VolatileStream(AlignedStream): + """Streaming class to handle various procfs and sysfs edge-cases. Backed by `AlignedStream`. + + Args: + path: Path of the file to obtain a file-handle from. + mode: Mode string to open the file-handle with. Such as "rt" and "rb". + flags: Flags to open the file-descriptor with. + size: The maximum size of the stream. None if unknown. + """ + + def __init__( + self, + path: Path, + mode: str = "rb", + # Windows and Darwin systems don't have O_NOATIME or O_NONBLOCK. Add them if they are available. + flags: int = (os.O_RDONLY | getattr(os, "O_NOATIME", 0) | getattr(os, "O_NONBLOCK", 0)), + size: int = 1024 * 1024 * 5, + ): + self.fh = path.open(mode) + self.fd = self.fh.fileno() + + if HAS_FCNTL: + fcntl(self.fd, F_SETFL, flags) + + st_mode = os.fstat(self.fd).st_mode + write_only = (st_mode & (S_IRUSR | S_IRGRP | S_IROTH)) == 0 # novermin + + super().__init__(0 if write_only else size) + + def seek(self, pos: int, whence: int = SEEK_SET) -> int: + raise UnsupportedOperation("VolatileStream is not seekable") + + def seekable(self) -> bool: + return False + + def _read(self, offset: int, length: int) -> bytes: + result = [] + while length: + try: + buf = os.read(self.fd, min(length, self.size - offset)) + except BlockingIOError: + break + + if not buf: + break + + result.append(buf) + offset += len(buf) + length -= len(buf) + return b"".join(result) diff --git a/tests/conftest.py b/tests/conftest.py index d4633b09..5306a3c9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ -from unittest.mock import Mock +import io +from typing import BinaryIO import pytest from dissect.target import Target @@ -6,14 +7,14 @@ @pytest.fixture -def mock_file() -> Mock: - return Mock() +def mock_file() -> BinaryIO: + return io.BytesIO(b"Mock File") @pytest.fixture def mock_fs(mock_file) -> VirtualFilesystem: fs = VirtualFilesystem(case_sensitive=False) - fs.makedirs("/foo/bar") + fs.makedirs("/foo/bar/some-dir") fs.map_file_entry("/foo/bar/some-file", VirtualFile(fs, "some-file", mock_file)) fs.map_file_entry("/foo/bar/own-file", VirtualFile(fs, "own-file", mock_file)) fs.map_file_entry("/foo/bar/some-symlink", VirtualSymlink(fs, "some-symlink", "/foo/bar/some-file")) diff --git a/tests/test_collector.py b/tests/test_collector.py index 52ff4da0..8a0152fe 100644 --- a/tests/test_collector.py +++ b/tests/test_collector.py @@ -39,8 +39,9 @@ def test_collector() -> None: @pytest.fixture def mock_collector(mock_target) -> Collector: - collector = Collector(mock_target, Mock()) - return collector + with patch("acquire.outputs.base.Output", autospec=True) as mock_output: + collector = Collector(mock_target, mock_output) + return collector MOCK_SEEN_PATHS = set() @@ -279,6 +280,36 @@ def test_collector_collect_path_with_exception(mock_target, mock_collector, repo assert mock_log.error.call_args.args[0] == log_msg +@pytest.mark.parametrize( + "path_name, volatile, collect_path_called, write_entry_called", + [ + ("/foo/bar/some-dir", False, False, False), + ("/foo/bar/some-dir", True, False, True), + ("/foo/bar", False, True, False), + ("foo/bar", True, True, False), + ], +) +def test_collector_collect_dir( + mock_target: Target, + mock_collector: Collector, + path_name: str, + volatile: bool, + collect_path_called: bool, + write_entry_called: bool, +) -> None: + path = mock_target.fs.path(path_name) + with patch.object(mock_collector, "collect_path", autospec=True): + mock_collector.collect_dir( + path, + seen_paths=MOCK_SEEN_PATHS, + module_name=MOCK_MODULE_NAME, + follow=False, + volatile=volatile, + ) + assert mock_collector.collect_path.called == collect_path_called + assert mock_collector.output.write_entry.called == write_entry_called + + def create_temp_files(tmp_path: Path, paths: list[str]) -> None: for path in paths: creation_path = tmp_path.joinpath(path) diff --git a/tests/test_outputs_dir.py b/tests/test_outputs_dir.py new file mode 100644 index 00000000..47af3db2 --- /dev/null +++ b/tests/test_outputs_dir.py @@ -0,0 +1,58 @@ +from pathlib import Path + +import pytest +from dissect.target.filesystem import VirtualFilesystem + +from acquire.outputs import DirectoryOutput + + +@pytest.fixture +def dir_output(tmp_path: Path) -> DirectoryOutput: + tmp_path.mkdir(parents=True, exist_ok=True) + return DirectoryOutput(tmp_path) + + +def leaves(path: Path) -> list[Path]: + leave_paths = [] + + dir_is_empty = True + for path in path.iterdir(): + dir_is_empty = False + if path.is_dir(): + leave_paths.extend(leaves(path)) + else: + leave_paths.append(path) + + if dir_is_empty: + leave_paths.append(path) + + return leave_paths + + +@pytest.mark.parametrize( + "entry_name", + [ + "/foo/bar/some-file", + "/foo/bar/some-symlink", + "/foo/bar/some-dir", + ], +) +def test_dir_output_write_entry(mock_fs: VirtualFilesystem, dir_output: DirectoryOutput, entry_name: str) -> None: + entry = mock_fs.get(entry_name) + dir_output.write_entry(entry_name, entry) + dir_output.close() + + path = dir_output.path + files = leaves(path) + + assert len(files) == 1 + + file = files[0] + assert str(file)[len(str(path)) :] == entry_name + + if entry.is_dir(): + assert file.is_dir() + elif entry.is_symlink(): + assert file.is_file() + elif entry.is_file(): + assert file.is_file() diff --git a/tests/test_outputs_tar.py b/tests/test_outputs_tar.py new file mode 100644 index 00000000..652fdad1 --- /dev/null +++ b/tests/test_outputs_tar.py @@ -0,0 +1,41 @@ +import tarfile +from pathlib import Path + +import pytest +from dissect.target.filesystem import VirtualFilesystem + +from acquire.outputs import TarOutput + + +@pytest.fixture +def tar_output(tmp_path: Path) -> TarOutput: + return TarOutput(tmp_path) + + +@pytest.mark.parametrize( + "entry_name", + [ + "/foo/bar/some-file", + "/foo/bar/some-symlink", + "/foo/bar/some-dir", + ], +) +def test_tar_output_write_entry(mock_fs: VirtualFilesystem, tar_output: TarOutput, entry_name: str) -> None: + entry = mock_fs.get(entry_name) + tar_output.write_entry(entry_name, entry) + tar_output.close() + + tar_file = tarfile.open(tar_output.path) + files = tar_file.getmembers() + + assert len(files) == 1 + + file = files[0] + assert file.path == entry_name + + if entry.is_dir(): + assert file.isdir() + elif entry.is_symlink(): + assert file.issym() + elif entry.is_file(): + assert file.isfile()