Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add mount namespaces to linux sandbox #45

Merged
merged 19 commits into from
Sep 28, 2023
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ name = "fs"
path = "tests/fs.rs"
harness = false

[[test]]
name = "fs_without_landlock"
path = "tests/fs_without_landlock.rs"
harness = false

[[test]]
name = "full_env"
path = "tests/full_env.rs"
Expand Down
179 changes: 70 additions & 109 deletions src/linux/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@
//! This module implements sandboxing on Linux based on the Landlock LSM,
//! namespaces, and seccomp.

use std::fs;
use std::io::Error as IoError;
use std::collections::HashMap;
use std::path::PathBuf;

use bitflags::bitflags;
use landlock::{
make_bitflags, Access, AccessFs, Compatible, PathBeneath, PathFd, Ruleset, RulesetAttr,
RulesetCreated, RulesetCreatedAttr, RulesetStatus, ABI as LANDLOCK_ABI,
Expand All @@ -16,19 +15,42 @@ use crate::error::{Error, Result};
use crate::linux::seccomp::NetworkFilter;
use crate::{Exception, Sandbox};

mod namespaces;
mod seccomp;

/// Minimum landlock ABI version.
const ABI: LANDLOCK_ABI = LANDLOCK_ABI::V1;

/// Linux sandboxing.
pub struct LinuxSandbox {
bind_mounts: HashMap<PathBuf, libc::c_ulong>,
cd-work marked this conversation as resolved.
Show resolved Hide resolved
env_exceptions: Vec<String>,
landlock: RulesetCreated,
allow_networking: bool,
full_env: bool,
}

impl LinuxSandbox {
/// Add or modify a bind mount.
///
/// This will add a new bind mount with the specified permission if it does
/// not exist already.
///
/// If the bind mount already exists, it will *ADD* the additional
/// permissions.
fn update_bind_mount(&mut self, path: PathBuf, write: bool, execute: bool) {
let flags = self.bind_mounts.entry(path).or_insert(libc::MS_RDONLY | libc::MS_NOEXEC);

if write {
*flags &= !libc::MS_RDONLY;
}

if execute {
*flags &= !libc::MS_NOEXEC;
}
}
}

impl Sandbox for LinuxSandbox {
fn new() -> Result<Self> {
// Setup landlock filtering.
Expand All @@ -38,14 +60,38 @@ impl Sandbox for LinuxSandbox {
.create()?;
landlock.as_mut().set_no_new_privs(true);

Ok(Self { landlock, env_exceptions: Vec::new(), allow_networking: false, full_env: false })
Ok(Self {
landlock,
allow_networking: false,
full_env: false,
env_exceptions: Default::default(),
bind_mounts: Default::default(),
})
}

fn add_exception(&mut self, exception: Exception) -> Result<&mut Self> {
let (path, access) = match exception {
Exception::Read(path) => (path, make_bitflags!(AccessFs::{ ReadFile | ReadDir })),
Exception::Write(path) => (path, AccessFs::from_write(ABI)),
Exception::ExecuteAndRead(path) => (path, AccessFs::from_read(ABI)),
let (path_fd, access) = match exception {
Exception::Read(path) => {
let path_fd = PathFd::new(&path)?;

self.update_bind_mount(path, false, false);

(path_fd, make_bitflags!(AccessFs::{ ReadFile | ReadDir }))
},
Exception::Write(path) => {
let path_fd = PathFd::new(&path)?;

self.update_bind_mount(path, true, false);

(path_fd, AccessFs::from_write(ABI))
},
Exception::ExecuteAndRead(path) => {
let path_fd = PathFd::new(&path)?;

self.update_bind_mount(path, false, true);

(path_fd, AccessFs::from_read(ABI))
},
Exception::Environment(key) => {
self.env_exceptions.push(key);
return Ok(self);
Expand All @@ -60,7 +106,7 @@ impl Sandbox for LinuxSandbox {
},
};

let rule = PathBeneath::new(PathFd::new(path)?, access);
let rule = PathBeneath::new(path_fd, access);

self.landlock.as_mut().add_rule(rule)?;

Expand All @@ -73,12 +119,23 @@ impl Sandbox for LinuxSandbox {
crate::restrict_env_variables(&self.env_exceptions);
}

// Clear abstract namespace by entering a new user namespace.
let _ = create_user_namespace(false);
// Setup namespaces.
let namespace_result =
namespaces::create_namespaces(self.allow_networking, self.bind_mounts);

// Create network namespace.
// Setup seccomp network filter.
if !self.allow_networking {
restrict_networking()?;
let seccomp_result = NetworkFilter::apply();

// Propagate failure if neither seccomp nor namespaces could isolate networking.
if let (Err(_), Err(err)) = (&namespace_result, seccomp_result) {
return Err(err);
}
}

// Use landlock only if namespaces failed.
if namespace_result.is_ok() {
return Ok(());
}

// Apply landlock rules.
Expand All @@ -92,99 +149,3 @@ impl Sandbox for LinuxSandbox {
}
}
}

/// Restrict networking using seccomp and namespaces.
fn restrict_networking() -> Result<()> {
// Create network namespace.
let result = create_user_namespace(true).and_then(|_| unshare(Namespaces::NETWORK));

// Apply seccomp network filter.
let seccomp_result = NetworkFilter::apply();
result.or(seccomp_result)
}

/// Create a new user namespace.
///
/// If the `become_root` flag is set, then the current user will be mapped to
/// UID 0 inside the namespace. Otherwise the current user will be mapped to its
/// UID of the parent namespace.
fn create_user_namespace(become_root: bool) -> Result<()> {
// Get the current UID/GID.
let uid = unsafe { libc::geteuid() };
let gid = unsafe { libc::getegid() };

// Create the namespace.
unshare(Namespaces::USER)?;

// Map the UID and GID.
let uid_map = if become_root { format!("0 {uid} 1\n") } else { format!("{uid} {uid} 1\n") };
let gid_map = if become_root { format!("0 {gid} 1\n") } else { format!("{gid} {gid} 1\n") };
fs::write("/proc/self/uid_map", uid_map.as_bytes())?;
fs::write("/proc/self/setgroups", b"deny")?;
fs::write("/proc/self/gid_map", gid_map.as_bytes())?;

Ok(())
}

/// Enter a namespace.
fn unshare(namespaces: Namespaces) -> Result<()> {
let result = unsafe { libc::unshare(namespaces.bits()) };
if result == 0 {
Ok(())
} else {
Err(IoError::last_os_error().into())
}
}

bitflags! {
/// Unshare system call namespace flags.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
struct Namespaces: libc::c_int {
/// Unshare the file descriptor table, so that the calling process no longer
/// shares its file descriptors with any other process.
const FILES = libc::CLONE_FILES;
/// Unshare filesystem attributes, so that the calling process no longer shares
/// its root directory, current directory, or umask attributes with any other process.
const FS = libc::CLONE_FS;
/// Unshare the cgroup namespace.
const CGROUP = libc::CLONE_NEWCGROUP;
/// Unshare the IPC namespace, so that the calling process has a private copy of
/// the IPC namespace which is not shared with any other process. Specifying
/// this flag automatically implies [`Namespaces::SYSVSEM`] as well.
const IPC = libc::CLONE_NEWIPC;
/// Unshare the network namespace, so that the calling process is moved into a
/// new network namespace which is not shared with any previously existing process.
const NETWORK = libc::CLONE_NEWNET;
/// Unshare the mount namespace, so that the calling process has a private copy
/// of its namespace which is not shared with any other process. Specifying this
/// flag automatically implies [`Namespaces::FS`] as well.
const MOUNT = libc::CLONE_NEWNS;
/// Unshare the PID namespace, so that the calling process has a new PID
/// namespace for its children which is not shared with any previously existing
/// process. The calling process is **not** moved into the new namespace. The
/// first child created by the calling process will have the process ID 1 and
/// will assume the role of init in the new namespace. Specifying this flag
/// automatically implies [`libc::CLONE_THREAD`] as well.
const PID = libc::CLONE_NEWPID;
/// Unshare the time namespace, so that the calling process has a new time
/// namespace for its children which is not shared with any previously existing
/// process. The calling process is **not** moved into the new namespace.
const TIME = 0x80;
/// Unshare the user namespace, so that the calling process is moved into a new
/// user namespace which is not shared with any previously existing process. The
/// caller obtains a full set of capabilities in the new namespace.
///
/// Requires that the calling process is not threaded; specifying this flag
/// automatically implies [`libc::CLONE_THREAD`] and [`Namespaces::FS`] as well.
const USER = libc::CLONE_NEWUSER;
/// Unshare the UTS IPC namespace, so that the calling process has a private
/// copy of the UTS namespace which is not shared with any other process.
const UTS = libc::CLONE_NEWUTS;
/// Unshare System V semaphore adjustment (semadj) values, so that the calling
/// process has a new empty semadj list that is not shared with any other
/// process. If this is the last process that has a reference to the process's
/// current semadj list, then the adjustments in that list are applied to the
/// corresponding semaphores
const SYSVSEM = libc::CLONE_SYSVSEM;
}
}
Loading
Loading