From 1a457e50cc02025f15b083942b9621071d916116 Mon Sep 17 00:00:00 2001 From: Frediano Ziglio Date: Sun, 17 Mar 2024 10:37:24 +0000 Subject: [PATCH] IH-533: Remove usage of forkexecd daemon to execute processes Forkexecd was written to avoid some issues with Ocaml and multi-threading. Instead use C code to launch processes and avoid these issues. Interface remains unchanged from Ocaml side but implementation rely entirely on C code. vfork() is used to avoid performance memory issue. Reap of the processes are done directly. Code automatically reap child processes to avoid zombies. One small helper is used to better separate Ocaml and C code and handling syslog redirection. This allows to better debug in case of issues. Syslog handling is done in a separate process allowing to restart the toolstack and keep launched programs running; note that even with forkexecd daemon one process was used for this purpose. Code tries to keep compatibility with forkexecd, in particular: - SIGPIPE is ignored in the parent; - /dev/null is open with O_WRONLY even for stdin; - file descriptors are limited to 1024. We use close_range (if available) to reduce system calls to close file descriptors. Cgroup is set to avoid systemd closing processes on toolstack restart. There's a fuzzer program to check file remapping algorithm; for this reason the algorithm is in a separate file. To turn internal debug on you need to set FORKEXECD_DEBUG_LOGS C preprocessor macro to 1. Signed-off-by: Frediano Ziglio --- Makefile | 2 + ocaml/forkexecd/helper/Makefile | 35 ++ ocaml/forkexecd/helper/algo_fuzzer.c | 246 ++++++++++++++ ocaml/forkexecd/helper/close_from.c | 86 +++++ ocaml/forkexecd/helper/close_from.h | 19 ++ ocaml/forkexecd/helper/dune | 21 ++ ocaml/forkexecd/helper/logs.c | 149 +++++++++ ocaml/forkexecd/helper/logs.h | 70 ++++ ocaml/forkexecd/helper/redirect_algo.h | 210 ++++++++++++ ocaml/forkexecd/helper/syslog.c | 96 ++++++ ocaml/forkexecd/helper/syslog.h | 21 ++ ocaml/forkexecd/helper/vfork_helper.c | 433 +++++++++++++++++++++++++ ocaml/forkexecd/helper/vfork_helper.h | 23 ++ ocaml/forkexecd/lib/dune | 8 +- ocaml/forkexecd/lib/fe_stubs.c | 405 +++++++++++++++++++++++ ocaml/forkexecd/lib/forkhelpers.ml | 155 +++++++-- ocaml/forkexecd/test/dune | 2 +- ocaml/forkexecd/test/fe_test.sh | 8 +- ocaml/forkexecd/test/syslog.c | 64 ++++ ocaml/libs/stunnel/stunnel.ml | 5 - 20 files changed, 2025 insertions(+), 33 deletions(-) create mode 100644 ocaml/forkexecd/helper/Makefile create mode 100644 ocaml/forkexecd/helper/algo_fuzzer.c create mode 100644 ocaml/forkexecd/helper/close_from.c create mode 100644 ocaml/forkexecd/helper/close_from.h create mode 100644 ocaml/forkexecd/helper/dune create mode 100644 ocaml/forkexecd/helper/logs.c create mode 100644 ocaml/forkexecd/helper/logs.h create mode 100644 ocaml/forkexecd/helper/redirect_algo.h create mode 100644 ocaml/forkexecd/helper/syslog.c create mode 100644 ocaml/forkexecd/helper/syslog.h create mode 100644 ocaml/forkexecd/helper/vfork_helper.c create mode 100644 ocaml/forkexecd/helper/vfork_helper.h create mode 100644 ocaml/forkexecd/lib/fe_stubs.c diff --git a/Makefile b/Makefile index 337e4dad88c..6c5aa70c890 100644 --- a/Makefile +++ b/Makefile @@ -254,6 +254,8 @@ install: build doc sdk doc-json install -m 644 _build/default/ocaml/networkd/bin/xcp-networkd.1 $(DESTDIR)/usr/share/man/man1/xcp-networkd.1 # wsproxy install -m 755 _build/install/default/bin/wsproxy $(DESTDIR)$(LIBEXECDIR)/wsproxy +# forkexecd + install -m 755 _build/default/ocaml/forkexecd/helper/vfork_helper $(DESTDIR)/usr/libexec/xapi/vfork_helper # dune can install libraries and several other files into the right locations dune install --destdir=$(DESTDIR) --prefix=$(PREFIX) --libdir=$(LIBDIR) --mandir=$(MANDIR) \ xapi-client xapi-schema xapi-consts xapi-cli-protocol xapi-datamodel xapi-types \ diff --git a/ocaml/forkexecd/helper/Makefile b/ocaml/forkexecd/helper/Makefile new file mode 100644 index 00000000000..74fd61ec477 --- /dev/null +++ b/ocaml/forkexecd/helper/Makefile @@ -0,0 +1,35 @@ +## Set some macro but not override environment ones +CFLAGS ?= -O2 -g -Wall -Werror +LDFLAGS ?= + +all:: vfork_helper + +clean:: + rm -f vfork_helper *.o + +%.o: %.c + gcc $(CFLAGS) -c -o $@ $< + +vfork_helper: vfork_helper.o close_from.o syslog.o + gcc $(CFLAGS) $(LDFLAGS) -o $@ $^ -pthread + +close_from.o: close_from.h Makefile +syslog.o: syslog.h Makefile +vfork_helper.o: redirect_algo.h Makefile + +## Fuzzer uses AFL (American Fuzzy Lop). +## +## Use "make fuzz" to build and launch the fuzzer +## +## Use "make show" to look at the first failures (if found). + +fuzz:: + afl-gcc $(CFLAGS) -Wall -Werror -o algo_fuzzer algo_fuzzer.c + rm -rf testcase_dir + mkdir testcase_dir + echo maomaoamaoaoao > testcase_dir/test1 + rm -rf findings_dir/ + afl-fuzz -i testcase_dir -o findings_dir -D -- ./algo_fuzzer + +show:: + cat "$$(ls -1 findings_dir/default/crashes/id* | head -1)" | ./algo_fuzzer diff --git a/ocaml/forkexecd/helper/algo_fuzzer.c b/ocaml/forkexecd/helper/algo_fuzzer.c new file mode 100644 index 00000000000..946332e6839 --- /dev/null +++ b/ocaml/forkexecd/helper/algo_fuzzer.c @@ -0,0 +1,246 @@ + +/* + * Copyright (C) Citrix Systems Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + */ + +#undef NDEBUG +#define DEBUG 1 + +#if DEBUG +#define log(fmt, ...) printf(fmt "\n", ##__VA_ARGS__) +#else +#define log(fmt, ...) do {} while(0) +#endif + +// include as first file to make sure header is self container +#include "redirect_algo.h" + +#include +#include +#include +#include +#include +#include +#include + +static int fake_close(int fd); + +typedef struct { + bool open; + bool cloexec; + char *name; +} fd; + +#define NUM_FDS 4096 +static fd fds[NUM_FDS]; + +static bool +fake_close_fds_from(int fd_from) +{ + for (int fd = fd_from; fd < NUM_FDS; ++fd) + fake_close(fd); + + return true; +} + +#define O_WRONLY 1 +static int +fake_open(const char *fn, int dummy) +{ + for (int i = 0; i < NUM_FDS; ++i) + if (!fds[i].open) { + assert(fds[i].name == NULL); + fds[i].name = strdup(fn); + fds[i].open = true; + fds[i].cloexec = false; + return i; + } + assert(0); + return -1; +} + +static int +fake_close(int fd) +{ + assert(fd >= 0); + assert(fd < NUM_FDS); + if (!fds[fd].open) { + errno = EBADF; + return -1; + } + fds[fd].open = false; + free(fds[fd].name); + fds[fd].name = NULL; + return 0; +} + +static int +fake_dup2(int from, int to) +{ + assert(from >= 0 && from < NUM_FDS); + assert(to >= 0 && to < NUM_FDS); + assert(fds[from].open); + assert(from != to); + free(fds[to].name); + fds[to].open = true; + fds[to].name = strdup(fds[from].name); + fds[to].cloexec = false; + return 0; +} + +static int +fake_fcntl(int fd) +{ + assert(fd >= 0 && fd < NUM_FDS); + assert(fds[fd].open); + fds[fd].cloexec = false; + return 0; +} + +int main(int argc, char **argv) +{ + // Input where a given FD goes?? + // No, not enough, can be duplicated. + // Numbers >4096 in 2 bytes not file descriptor, + // (-1 for standard, skip for normal). + // We should add some random fds. + enum { MAX_FILE_BUF = 2048 }; + uint16_t file_buf[MAX_FILE_BUF]; + size_t read = fread(file_buf, 2, MAX_FILE_BUF, stdin); + if (read < 3) + return 0; + + static const char standard_names[][8] = { + "stdin", "stdout", "stderr" + }; + int num_mappings = 0; + uint16_t *num = file_buf; + mapping mappings[MAX_FILE_BUF]; + int i = 0; + for (i = 0; i < 3; ++i) { + mapping *m = &mappings[num_mappings++]; + m->uuid = standard_names[i]; + uint16_t n = *num++; + m->current_fd = n < NUM_FDS ? n : -1; + m->wanted_fd = i; + } + for (; i < read; ++i) { + uint16_t n = *num++; + if (n >= NUM_FDS) + continue; + + mapping *m = &mappings[num_mappings++]; + m->current_fd = n; + m->wanted_fd = -1; + char buf[64]; + sprintf(buf, "file%d", i); + m->uuid = strdup(buf); + } + if (num_mappings > MAX_TOTAL_MAPPINGS) + return 0; + + for (unsigned n = 0; n < num_mappings; ++n) { + mapping *m = &mappings[n]; + int fd = m->current_fd; + if (fd < 0) + continue; + fake_close(fd); + fds[fd].open = true; + fds[fd].name = strdup(m->uuid); + fds[fd].cloexec = true; + } + + // Check in the final file mapping all valid mappings + // have an open file descriptor. + // There should be no duplicate numbers in current_fd. + // current_fd must be in a range. + // Only if wanted_fd >= 0 current_fd can be -1. + // There should be a correspondance between input and output names. + // If current_fd was -1 it will still be -1. + // If wanted_fd >= 0 current_fd should be the same. + + fd_operation operations[MAX_OPERATIONS]; + int num_operations = + redirect_mappings(mappings, num_mappings, operations); + assert(num_operations > 0); + assert(num_operations <= MAX_OPERATIONS); + + for (int i = 0; i < num_operations; ++i) { + const fd_operation* op = &operations[i]; + log("op %d %d %d", op->fd_from, op->fd_to, op->operation); + switch (op->operation) { + case FD_OP_DUP: + if (op->fd_from == op->fd_to) + fake_fcntl(op->fd_from); + else + fake_dup2(op->fd_from, op->fd_to); + break; + case FD_OP_MOVE: + assert(op->fd_from != op->fd_to); + fake_dup2(op->fd_from, op->fd_to); + fake_close(op->fd_from); + break; + case FD_OP_DEVNULL: + // first close old, then create new one + fake_close(op->fd_to); + // TODO ideally we want read only for input for Ocaml did the same... + assert(fake_open("/dev/null", O_WRONLY) == op->fd_to); + break; + case FD_OP_CLOSE_FROM: + fake_close_fds_from(op->fd_from); + break; + default: + assert(0); + } + } + + // check files opened + for (int fd = 0; fd < NUM_FDS; ++fd) + assert(fds[fd].open == (fd < num_mappings)); + + for (int fd = 0; fd < num_mappings; ++fd) { + assert(fds[fd].cloexec == false); + log("file %d %s", fd, fds[fd].name); + } + + // Check in the final file mapping all valid mappings + // has an open file descriptor. + bool already_found[NUM_FDS] = { false, }; + for (unsigned n = 0; n < num_mappings; ++n) { + const int fd = mappings[n].current_fd; + const int wanted = mappings[n].wanted_fd; + if (fd >= 0) { + assert(fd < NUM_FDS); + assert(fds[fd].open); + + // There should be no duplicate numbers in current_fd. + assert(!already_found[fd]); + already_found[fd] = true; + } else { + // Only if wanted_fd >= 0 current_fd can be -1. + assert(mappings[n].wanted_fd >= 0); + assert(fd == -1); + } + + // If wanted_fd >= 0 current_fd should be the same. + if (wanted >= 0) + assert(wanted == fd || fd == -1); + + // current_fd must be in a range. + assert(fd >= -1); + assert(fd < num_mappings); + } + + // There should be a correspondance between input and output names. + // If current_fd was -1 it will still be -1. +} diff --git a/ocaml/forkexecd/helper/close_from.c b/ocaml/forkexecd/helper/close_from.c new file mode 100644 index 00000000000..207325d7407 --- /dev/null +++ b/ocaml/forkexecd/helper/close_from.c @@ -0,0 +1,86 @@ +/* + * Copyright (C) Citrix Systems Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + */ + +#include "close_from.h" + +#include +#include +#include +#include +#include +#include + +#ifdef __linux__ +#include +#endif + +// try to use close_range on Linux even if not defined by headers +#if defined(__linux__) && !defined(SYS_close_range) +# if defined(__alpha__) +# define SYS_close_range 546 +# elif defined(__amd64__) || defined(__x86_64__) || defined(__arm__) || \ + defined(__aarch64__) || defined(__hppa__) || defined(__i386__) || \ + defined(__ia64__) || defined(__m68k__) || defined(__mips__) || \ + defined(__powerpc__) || defined(__powerpc64__) || defined(__sparc__) || \ + defined(__s390x__) +# define SYS_close_range 436 +# endif +#endif + +bool +close_fds_from(int fd_from) +{ + // first method, use close_range +#if (defined(__linux__) && defined(SYS_close_range)) \ + || (defined(__FreeBSD__) && defined(CLOSE_RANGE_CLOEXEC)) + static bool close_range_supported = true; + if (close_range_supported) { +#if defined(__linux__) + if (syscall(SYS_close_range, fd_from, ~0U, 0) == 0) +#else + if (close_range(fd_from, ~0U, 0) == 0) +#endif + return true; + + if (errno == ENOSYS) + close_range_supported = false; + } +#endif + + // second method, read fds list from /proc + DIR *dir = opendir("/proc/self/fd"); + if (dir) { + const int dir_fd = dirfd(dir); + struct dirent *ent; + while ((ent = readdir(dir)) != NULL) { + char *end = NULL; + unsigned long fd = strtoul(ent->d_name, &end, 10); + if (end == NULL || *end) + continue; + if (fd >= fd_from && fd != dir_fd) + close(fd); + } + closedir(dir); + return true; + } + + // third method, use just a loop + struct rlimit limit; + if (getrlimit(RLIMIT_NOFILE, &limit) < 0) + return false; + for (int fd = fd_from; fd < limit.rlim_cur; ++ fd) + close(fd); + + return true; +} diff --git a/ocaml/forkexecd/helper/close_from.h b/ocaml/forkexecd/helper/close_from.h new file mode 100644 index 00000000000..7a98b9a0ecc --- /dev/null +++ b/ocaml/forkexecd/helper/close_from.h @@ -0,0 +1,19 @@ +/* + * Copyright (C) Citrix Systems Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + */ + +#pragma once + +#include + +bool close_fds_from(int fd); diff --git a/ocaml/forkexecd/helper/dune b/ocaml/forkexecd/helper/dune new file mode 100644 index 00000000000..a22fdf45ce9 --- /dev/null +++ b/ocaml/forkexecd/helper/dune @@ -0,0 +1,21 @@ +(rule + (targets vfork_helper) + (package forkexec) + (deps + close_from.h + close_from.c + logs.h + logs.c + syslog.h + syslog.c + redirect_algo.h + vfork_helper.h + vfork_helper.c + Makefile) + (action + (run make))) + +(rule + (alias install) + (deps vfork_helper) + (action (run true))) diff --git a/ocaml/forkexecd/helper/logs.c b/ocaml/forkexecd/helper/logs.c new file mode 100644 index 00000000000..0b44bbb0ea4 --- /dev/null +++ b/ocaml/forkexecd/helper/logs.c @@ -0,0 +1,149 @@ +/* + * Copyright (C) Citrix Systems Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + */ + +#include "logs.h" + +#if FORKEXECD_DEBUG_LOGS + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#define FILE_SIZE (32 * 1024) + +struct priv_mapped_logs { + uint32_t size; + + // Flags, we use characters instead of binary so + // easily see them easily with different tools. + char flags[4]; + char filename[64]; + pid_t pid; + int num; +}; + +// flags order +enum { SUCCESS, FAILURE }; + +mapped_logs mapped_logs_open(void) +{ + static int last_num = 0; + + // create a mapped file with a given size, will write header as structure + // and update using memory + mkdir("/tmp/fe_repl", 0755); + + char tmpl[] = "/tmp/fe_repl/logXXXXXX"; + int fd = mkstemp(tmpl); + if (!fd) + caml_raise_out_of_memory(); + + if (ftruncate(fd, FILE_SIZE) < 0) { + close(fd); + caml_raise_out_of_memory(); + } + + priv_mapped_logs *l = mmap(NULL, FILE_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (l == MAP_FAILED) { + close(fd); + caml_raise_out_of_memory(); + } + close(fd); + + l->size = sizeof(*l); + memcpy(l->flags, "____", 4); + strncpy(l->filename, tmpl, sizeof(l->filename)); + l->pid = getpid(); + l->num = ++last_num; + + return (mapped_logs){l}; +} + +#define DEFINE_RANGE(start, end) \ + char *start = (char*) logs.priv + sizeof(priv_mapped_logs); \ + char *const end = (char*) logs.priv + FILE_SIZE + +void mapped_logs_close(mapped_logs logs) +{ + if (!logs.priv) + return; + DEFINE_RANGE(start, end); + bool written = false; + bool success = logs.priv->flags[FAILURE] == '_' && logs.priv->flags[SUCCESS] != '_'; + if (!success) { + FILE *f = fopen("/tmp/fe_repl/all_logs", "a"); + if (f) { + end[-1] = 0; + size_t len = strlen(start); + written = (fwrite(start, 1, len, f) == len); + fclose(f); + } + } + if (written || success) + unlink(logs.priv->filename); + munmap(logs.priv, FILE_SIZE); +} + +void mapped_logs_failure(mapped_logs logs) +{ + if (!logs.priv) + return; + logs.priv->flags[FAILURE] = 'F'; +} + +void mapped_logs_success(mapped_logs logs) +{ + if (!logs.priv) + return; + logs.priv->flags[SUCCESS] = 'S'; +} + +void mapped_logs_add(mapped_logs logs, const char *fmt, ...) +{ + if (!logs.priv) + return; + int save_errno = errno; + DEFINE_RANGE(start, end); + start += strlen(start); + if (start >= end -1) { + errno = save_errno; + return; // no more space + } + size_t len = end - start; + int l = snprintf(start, len, "%d:%d ", (int) logs.priv->pid, logs.priv->num); + if (l >= len) { + errno = save_errno; + return; + } + start += l; + len -= l; + va_list ap; + va_start(ap, fmt); + vsnprintf(start, len, fmt, ap); + va_end(ap); + + errno = save_errno; +} +#endif diff --git a/ocaml/forkexecd/helper/logs.h b/ocaml/forkexecd/helper/logs.h new file mode 100644 index 00000000000..13d54331f21 --- /dev/null +++ b/ocaml/forkexecd/helper/logs.h @@ -0,0 +1,70 @@ +/* + * Copyright (C) Citrix Systems Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + */ + +// Definitions to write logs into memory mapped objects. +// We use a memory mapped object here because we close file descriptors +// so writing to file using them would cause logs to be lost. + +#pragma once + +#if !defined(FORKEXECD_DEBUG_LOGS) +#define FORKEXECD_DEBUG_LOGS 0 +#endif + +#if (FORKEXECD_DEBUG_LOGS) != 0 && (FORKEXECD_DEBUG_LOGS) != 1 +#error Expected FORKEXECD_DEBUG_LOGS to be defined either 0 or 1 +#endif + +typedef struct priv_mapped_logs priv_mapped_logs; +typedef struct mapped_logs mapped_logs; + +#if FORKEXECD_DEBUG_LOGS +struct mapped_logs { + priv_mapped_logs *priv; +}; +#define NULL_MAPPED_LOGS ((mapped_logs){0}) +mapped_logs mapped_logs_open(void); +void mapped_logs_close(mapped_logs logs); + +// Add a log entry, similar to printf. +void mapped_logs_add(mapped_logs logs, const char *fmt, ...); + +// Mark as failed, any failure will keep the log. +void mapped_logs_failure(mapped_logs logs); + +// Mark as successful, if successful and no failure during +// execution the log will be removed. +void mapped_logs_success(mapped_logs logs); +#else +// Use an empty structure, compiler will strip it passing +// it as a parameter without the needs to change the source +// code. +struct mapped_logs {}; +#define NULL_MAPPED_LOGS ((mapped_logs){}) +static inline mapped_logs mapped_logs_open(void) { + return (mapped_logs){}; +} + +static inline void mapped_logs_close(mapped_logs logs) { +} + +static inline void mapped_logs_failure(mapped_logs logs) { +} + +static inline void mapped_logs_success(mapped_logs logs) { +} + +#define mapped_logs_add(...) \ + do {} while(0) +#endif diff --git a/ocaml/forkexecd/helper/redirect_algo.h b/ocaml/forkexecd/helper/redirect_algo.h new file mode 100644 index 00000000000..367e5e1ec5a --- /dev/null +++ b/ocaml/forkexecd/helper/redirect_algo.h @@ -0,0 +1,210 @@ +/* Algorithm used to remap file handles before executing a process. + * The algorithm is separated in a different file in order to reuse for + * fuzzing it. + */ + +#pragma once + +#if !defined(DEBUG) +#define DEBUG 0 +#endif + +#if (DEBUG) != 0 && (DEBUG) != 1 +#error Expected DEBUG to be defined either 0 or 1 +#endif + +#ifndef log +#error Expected log macro to be defined +#endif + +#include +#include +#include + +typedef struct { + const char *uuid; + int current_fd; + int wanted_fd; +} mapping; + +typedef struct { + // source file + int fd_from; + // destination file + short fd_to; + // see FD_OP_ constants + uint8_t operation; +} fd_operation; + +typedef enum { + // Duplicate from fd_from to fd_to. + // If fd_from is the same as fd_to remove FD_CLOEXEC flag. + FD_OP_DUP, + // Duplicate from fd_from to fd_to and close fd_from. + FD_OP_MOVE, + // Open /dev/null on fd_to. + FD_OP_DEVNULL, + // Close from fd_from to the sky! + FD_OP_CLOSE_FROM, +} FD_OP; + +#define MAX_OPERATIONS 1024 +#define MAX_TOTAL_MAPPINGS (MAX_OPERATIONS - 4) + +static uint16_t remap_fds(mapping *const mappings, unsigned num_mappings, int from, int to); + +// Given the passed mappings update them (current_fd) and returns the +// requested operations to do the job. +// First 3 mappings should refer to standard file descriptors (stdin, +// stdout, stderr). +// Returns the number of operations to perform or negative if error. +static int +redirect_mappings(mapping *const mappings, const unsigned num_mappings, fd_operation *operations) +{ + mapping *const end_mappins = mappings + num_mappings; + uint16_t used_fds[MAX_OPERATIONS] = {0,}; + fd_operation *ops = operations; + +#define DUMP_MAPPINGS do { \ + if (DEBUG) { \ + for (unsigned i = 0; i < num_mappings; ++i) { \ + const mapping *m __attribute__((unused)) = &mappings[i]; \ + log("mapping %s %d %d", m->uuid, m->current_fd, m->wanted_fd); \ + } \ + char lbuf[MAX_OPERATIONS* 16]; \ + lbuf[0] = 0; \ + for (int i = 0; i < MAX_OPERATIONS; ++i) { \ + if (used_fds[i]) \ + sprintf(strchr(lbuf, 0), "%d=%d,", i, used_fds[i]); \ + } \ + log("used %s", lbuf); \ + } \ +} while(0); + + log("handle"); + + // parse all mappings + standard fds, mark ones using range 0-MAX_OPERATIONS + for (mapping *m = mappings; m < end_mappins; ++m) { + if (m->current_fd < 0 || m->current_fd >= MAX_OPERATIONS) + continue; + used_fds[m->current_fd]++; + } + DUMP_MAPPINGS; + + // Move standard file descriptors out of the way. + // Maximum 3 operations. + log("move standard fds away"); + for (mapping *m = mappings; m < end_mappins; ++m) { + const int current_fd = m->current_fd; + if (current_fd < 0 || current_fd > 2) + continue; + // find first available fd to use + int fd = 3; + while (used_fds[fd]) + ++fd; + *ops++ = (fd_operation){ current_fd, fd, FD_OP_DUP }; + uint16_t changed = remap_fds(mappings, num_mappings, current_fd, fd); + log("changed %d from %d to %d", changed, current_fd, fd); + used_fds[current_fd] = 0; + used_fds[fd] = changed; + } + DUMP_MAPPINGS; + + // Move standard fds into proper positions + // Maximum 3 operations (standard fds to be moved). + log("move standard fds correctly"); + for (mapping *m = mappings; m < end_mappins; ++m) { + const int current_fd = m->current_fd; + if (current_fd < 0 || m->wanted_fd < 0) + continue; + int fd = m->wanted_fd; + FD_OP op = FD_OP_DUP; + if (current_fd >= num_mappings) { + // move + op = FD_OP_MOVE; + uint16_t changed = remap_fds(mappings, num_mappings, current_fd, fd); + log("changed %d from %d to %d", changed, current_fd, fd); + used_fds[fd] = changed; + } else { + // duplicate + m->current_fd = fd; + if (--used_fds[current_fd] == 0) + op = FD_OP_MOVE; + used_fds[fd] = 1; + } + *ops++ = (fd_operation){ current_fd, fd, op }; + } + DUMP_MAPPINGS; + + // Remove cloexec on range [3, 3 + num mappings). + // Maximum no standard mappings operations. + log("remove cloexec flags"); + for (int fd = 3; fd < num_mappings; ++fd) { + if (!used_fds[fd]) + continue; + log("remove cloexec from %d", fd); + *ops++ = (fd_operation){ fd, fd, FD_OP_DUP }; + } + DUMP_MAPPINGS; + + // Move all fds left in range [3, 3 + num mappings). + // Maximum no standard mapping operations; then sum with + // the above is the no standard mapping operations. + log("move all fds left in range"); + int last_free = 3; + for (mapping *m = mappings; m < end_mappins; ++m) { + const int current_fd = m->current_fd; + if (m->wanted_fd >= 0) + continue; + if (current_fd < num_mappings && used_fds[current_fd] == 1) + continue; + while (used_fds[last_free]) + ++last_free; + int fd = last_free; + // TODO copied from above + FD_OP op = FD_OP_DUP; + if (current_fd >= num_mappings) { + // move + op = FD_OP_MOVE; + uint16_t changed = remap_fds(mappings, num_mappings, current_fd, fd); + log("changed %d from %d to %d", changed, current_fd, fd); + used_fds[fd] = changed; + } else { + // duplicate + m->current_fd = fd; + if (--used_fds[current_fd] == 0) + op = FD_OP_MOVE; + used_fds[fd] = 1; + } + *ops++ = (fd_operation){ current_fd, fd, op }; + } + DUMP_MAPPINGS; + + // Close extra fds. + *ops++ = (fd_operation){ num_mappings, 0, FD_OP_CLOSE_FROM }; + + // Create missing standard fds. + // Maximum standard mapping operations, but not the above, + // so the sum with move the standard is 3. + for (int fd = 0; fd < 3; ++fd) { + if (used_fds[fd]) + continue; + *ops++ = (fd_operation){ fd, fd, FD_OP_DEVNULL }; + } + + return ops - operations; +} + +static uint16_t +remap_fds(mapping *const mappings, unsigned num_mappings, int from, int to) +{ + uint16_t res = 0; + for (unsigned i = 0; i < num_mappings; ++i) { + mapping *m = &mappings[i]; + if (m->current_fd == from) { + m->current_fd = to; + res++; + } + } + return res; +} diff --git a/ocaml/forkexecd/helper/syslog.c b/ocaml/forkexecd/helper/syslog.c new file mode 100644 index 00000000000..ddfa50b66d6 --- /dev/null +++ b/ocaml/forkexecd/helper/syslog.c @@ -0,0 +1,96 @@ +/* + * Copyright (C) Citrix Systems Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include "syslog.h" + +#include +#include +#include + +static inline size_t quoted_length(const char c) +{ + return c == '\\' ? 2 : + (c >= ' ' && c < 0x7f) ? 1 : + 4; +} + +static const char hex[] = "0123456789ABCDEF"; + +static inline void write_quoted(char *const p, const char c) +{ + if (c == '\\') { + p[0] = p[1] = c; + } else if (c >= ' ' && c < 0x7f) { + p[0] = c; + } else { + p[0] = '\\'; + p[1] = 'x'; + p[2] = hex[(c>>4)&0xf]; + p[3] = hex[c&0xf]; + } +} + +static void syslog_line(const char *line, const char *key, int child_pid) +{ + syslog(LOG_DAEMON|LOG_INFO, "%s[%d]: %s", key, child_pid, line); +} + +// Quote and forward every line from "fd" to the syslog. +// "fd" will be closed. +bool forward_to_syslog(int fd, const char *key, int child_pid) +{ +#define syslog_line(line) syslog_line(line, key, child_pid) + FILE *f = fdopen(fd, "r"); + char quoted_buf[64000]; + char *dest = quoted_buf; + char *const dest_end = quoted_buf + sizeof(quoted_buf) - sizeof(" ...") - 1; + bool overflowed = false; + while (true) { + int ch = getc_unlocked(f); + + if (!overflowed && dest != quoted_buf && (ch == '\n' || ch == EOF)) { + *dest = 0; + syslog_line(quoted_buf); + } + + if (ch == EOF) { + bool res = !!feof(f); + fclose(f); + return res; + } + + if (ch == '\n') { + overflowed = false; + dest = quoted_buf; + continue; + } + + if (overflowed) + continue; + + const size_t quoted_len = quoted_length(ch); + if (dest + quoted_len >= dest_end) { + strcpy(dest, " ..."); + syslog_line(quoted_buf); + overflowed = true; + continue; + } + write_quoted(dest, ch); + dest += quoted_len; + } +} diff --git a/ocaml/forkexecd/helper/syslog.h b/ocaml/forkexecd/helper/syslog.h new file mode 100644 index 00000000000..8ca172b6324 --- /dev/null +++ b/ocaml/forkexecd/helper/syslog.h @@ -0,0 +1,21 @@ +/* + * Copyright (C) Citrix Systems Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + */ + +#pragma once + +#include + +// Quote and forward every line from "fd" to the syslog. +// "fd" will be closed. +bool forward_to_syslog(int fd, const char *key, int child_pid); diff --git a/ocaml/forkexecd/helper/vfork_helper.c b/ocaml/forkexecd/helper/vfork_helper.c new file mode 100644 index 00000000000..39ea01399ad --- /dev/null +++ b/ocaml/forkexecd/helper/vfork_helper.c @@ -0,0 +1,433 @@ +/* + * Copyright (C) Citrix Systems Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "close_from.h" +#include "syslog.h" +#include "logs.h" +#include "vfork_helper.h" + +#define log(...) do {} while(0) +#include "redirect_algo.h" + +typedef struct { + char **args; + mapping *mappings; + fd_operation operations[MAX_OPERATIONS]; + int err; + const char *err_func; +} exec_info; + +static void adjust_args(char **args, mapping *const mappings, unsigned num_mappings); +static void reset_signal_handlers(void); +static void clear_cgroup(void); +static const char *get_arg(int *argc, char ***argv); +static int get_fd(int *argc, char ***argv); +static void error(int err, const char *msg, ...); +static void init_syslog(const char *key, bool redirect_stderr_to_stdout); + +static int error_fd = -1; + +int main(int argc, char **argv) +{ + unsigned num_mappings = 3; + bool redirect_stderr_to_stdout = false; + const char *key = NULL; + struct rlimit nofile_limit; + mapping mappings_buf[MAX_TOTAL_MAPPINGS]; + exec_info info[1] = { NULL, }; + const char *directory = "/"; + + mapped_logs logs = mapped_logs_open(); +#undef log +#define log(fmt, ...) mapped_logs_add(logs, fmt "\n", ## __VA_ARGS__) +#define log_fail(fmt, ...) do {\ + mapped_logs_failure(logs); \ + mapped_logs_add(logs, fmt "\n", ## __VA_ARGS__); \ +} while(0) + + log("starting"); + + info->mappings = mappings_buf; + for (int i = 0; i < 3; ++i) { + mapping *const m = &info->mappings[i]; + m->uuid = NULL; + m->current_fd = -1; + m->wanted_fd = i; + } + + // Scan all arguments, check them and collect some information. + ++argv; + --argc; + for (;;) { + // we must have an argument left + const char *arg = get_arg(&argc, &argv); + + // next must be a single letter option + if (arg[0] != '-' || arg[1] == 0 || arg[2] != 0) + error(EINVAL, "Invalid option %s", arg); + + // final "--" + if (arg[1] == '-') + break; + + switch (arg[1]) { + case 'I': // stdin + info->mappings[0].current_fd = get_fd(&argc, &argv); + break; + case 'O': // stdout + info->mappings[1].current_fd = get_fd(&argc, &argv); + break; + case 'E': // stderr + info->mappings[2].current_fd = get_fd(&argc, &argv); + break; + case 'm': { // mapping + if (num_mappings >= MAX_TOTAL_MAPPINGS) { + log_fail("too many mappings"); + mapped_logs_close(logs); + error(EINVAL, "Too many mappings"); + } + const char *uuid = get_arg(&argc, &argv); + if (strlen(uuid) != 36) { + log_fail("invalid mapping"); + mapped_logs_close(logs); + error(EINVAL, "Invalid mapping UUID"); + } + const int fd = get_fd(&argc, &argv); + mapping* const m = &info->mappings[num_mappings++]; + m->uuid = uuid; + m->current_fd = fd; + m->wanted_fd = -1; + } + break; + case 's': // syslog (with key) + key = get_arg(&argc, &argv); + break; + case 'S': // syslog stderr to stdout + redirect_stderr_to_stdout = true; + break; + case 'd': + directory = get_arg(&argc, &argv); + break; + case 'e': { // error file descriptor + error_fd = get_fd(&argc, &argv); + if (num_mappings >= MAX_TOTAL_MAPPINGS) { + log_fail("too many mappings"); + mapped_logs_close(logs); + error(EINVAL, "Too many mappings"); + } + mapping* const m = &info->mappings[num_mappings++]; + m->uuid = NULL; + m->current_fd = error_fd; + m->wanted_fd = -1; + } + break; + default: + error(EINVAL, "Invalid option %s", arg); + } + } + + if (argc < 1) { + log_fail("no args"); + mapped_logs_close(logs); + error(EINVAL, "No command arguments"); + } + + info->args = argv; + + if (getrlimit(RLIMIT_NOFILE, &nofile_limit) < 0) { + int err = errno; + log_fail("getrlimit error"); + mapped_logs_close(logs); + error(err, "getrlimit"); + } + + sigset_t sigset; + + // Compute the file operations we need to do for the file mappings + int num_operations = + redirect_mappings(info->mappings, num_mappings, info->operations); + + if (FORKEXECD_DEBUG_LOGS) { + for (size_t n = 0; info->args[n]; ++n) + log("arg %zd %s", n, info->args[n]); + } + + // Rename all command line. + adjust_args(info->args, info->mappings, num_mappings); + + if (FORKEXECD_DEBUG_LOGS) { + for (size_t n = 0; info->args[n]; ++n) + log("arg %zd %s", n, info->args[n]); + } + + reset_signal_handlers(); + + if (strcmp(directory, ".") != 0 && chdir(directory) < 0) { + int err = errno; + log_fail("chdir %d", err); + error(err, "chdir"); + } + + // Clear cgroup otherwise systemd will shutdown processes if + // toolstack is restarted. + clear_cgroup(); + + if (setsid() < 0) { + int err = errno; + log_fail("setsid %d", errno); + error(err, "setsid"); + } + + // Redirect file descriptors. + int err = 0; + const char *err_func = NULL; + for (int i = 0; i < num_operations && err == 0; ++i) { + const fd_operation* const op = &info->operations[i]; + log("op %d %d %d", op->fd_from, op->fd_to, op->operation); + switch (op->operation) { + case FD_OP_DUP: + if (op->fd_from == op->fd_to) { + // These file descriptors came from another process, + // so surely they have the CLOEXEC flag set, nothing + // to do. + break; + } else { + err_func = "dup2"; + if (dup2(op->fd_from, op->fd_to) < 0) + err = errno; + if (op->fd_from == error_fd) + error_fd = op->fd_to; + } + break; + case FD_OP_MOVE: + err_func = "dup2"; + if (dup2(op->fd_from, op->fd_to) < 0) + err = errno; + if (op->fd_from == error_fd) + error_fd = op->fd_to; + close(op->fd_from); + break; + case FD_OP_DEVNULL: + // first close old, then create new one + close(op->fd_to); + // TODO ideally we want read only for input for Ocaml did the same... + err_func = "open"; + if (open("/dev/null", O_WRONLY) != op->fd_to) + err = errno ? errno : EBADF; + break; + case FD_OP_CLOSE_FROM: + close_fds_from(op->fd_from); + break; + default: + err_func = "safe_exec"; + err = EINVAL; + } + } + if (err != 0) { + info->err = err; + info->err_func = err_func; + log_fail("redirect error %d in %s", err, err_func); + error(err, "%s", err_func); + } + + if (key) + init_syslog(key, redirect_stderr_to_stdout); + + // Limit number of files limits to standard limit to avoid + // creating bugs with old programs. + if (nofile_limit.rlim_cur > 1024) { + nofile_limit.rlim_cur = 1024; + setrlimit(RLIMIT_NOFILE, &nofile_limit); + } + + // Reset signal mask, inherited by the process we are going to execute + sigemptyset(&sigset); + pthread_sigmask(SIG_SETMASK, &sigset, NULL); + + log("execv..."); + mapped_logs_success(logs); + if (error_fd >= 0) + close(error_fd); + execv(info->args[0], info->args); + log_fail("execve failed %d", errno); + // Here we could set err and err_func but we kept compatibility + // with forkexecd daemon. + exit(errno == ENOENT ? 127 : 126); +} + +static void +adjust_args(char **args, mapping *const mappings, unsigned num_mappings) +{ + for (; *args; ++args) { + char *arg = *args; + size_t len = strlen(arg); + if (len < 36) + continue; + + // replace uuid with file descriptor + char *uuid = arg + len - 36; + for (unsigned i = 0; i < num_mappings; ++i) { + const mapping *m = &mappings[i]; + if (m->uuid == NULL || strcmp(m->uuid, uuid) != 0) + continue; + sprintf(uuid, "%d", m->current_fd); + } + } +} + +static void +reset_signal_handlers(void) +{ + for (int sig = 1; sig < NSIG; ++sig) { + // these signals can't be overridden + if (sig == SIGKILL || sig == SIGSTOP) + continue; + + // Set signal dispositions. + // This avoids inherit unwanted overrides. + // Also prevent handling unwanted signal handler, especially using vfork(). + // Use ignore SIGPIPE for compatibility with forkexecd. + signal(sig, sig == SIGPIPE ? SIG_IGN : SIG_DFL); + } +} + +static void +clear_cgroup(void) +{ + int fd = open("/sys/fs/cgroup/systemd/cgroup.procs", O_WRONLY|O_CLOEXEC); + if (fd >= 0) { + char string_pid[32]; + int ignored __attribute__((unused)); + sprintf(string_pid, "%d\n", (int) getpid()); + ignored = write(fd, string_pid, strlen(string_pid)); + close(fd); + } +} + +static const char * +get_arg(int *argc, char ***argv) +{ + if (*argc < 0) + error(EINVAL, "Expected one more argument"); + + const char *arg = **argv; + --(*argc); + ++(*argv); + return arg; +} + +static int +get_fd(int *argc, char ***argv) +{ + const char *arg = get_arg(argc, argv); + unsigned long fd = strtoul(arg, NULL, 0); + if (fd < 0 || fd > INT_MAX) + error(EINVAL, "Expected valid file descriptor number"); + return (int) fd; +} + +static void +error(int err, const char *format, ...) +{ + if (error_fd >= 0) { + msg_t msg = { err }; + va_list ap; + va_start(ap, format); + int ignored __attribute__((unused)); + vsnprintf(msg.msg_buf, sizeof(msg.msg_buf), format, ap); + msg.msg_buf[sizeof(msg.msg_buf) - 1] = 0; + va_end(ap); + ignored = write(error_fd, &msg, offsetof(msg_t, msg_buf) + strlen(msg.msg_buf) + 1); + } + exit(125); +} + +static void +init_syslog(const char *key, bool redirect_stderr_to_stdout) +{ + int fds[2]; + if (pipe(fds) < 0) + error(errno, "pipe"); + dup2(fds[1], 1); + if (redirect_stderr_to_stdout) + dup2(fds[1], 2); + close(fds[1]); + + const int child_pid = (int) getpid(); + + pid_t pid = fork(); + if (pid < 0) + error(errno, "fork"); + + if (pid == 0) { + // child + close(0); + close(1); + if (open("/dev/null", O_RDONLY) != 0 + || open("/dev/null", O_WRONLY) != 1) + error(errno, "open"); + dup2(1, 2); + if (fds[0] != 3) { + dup2(fds[0], 3); + fds[0] = 3; + } + close_fds_from(4); + + pid = fork(); + if (pid < 0) + error(errno, "fork"); + if (pid > 0) + // parent + exit(0); + + openlog("forkexecd", 0, LOG_DAEMON); + forward_to_syslog(fds[0], key, child_pid); + exit(0); + } + + close(fds[0]); + + // parent + int status; + wait(&status); + if (!WIFEXITED(status)) + error(EPIPE, "syslogger"); + + switch (WEXITSTATUS(status)) { + case 0: + // success + return; + case 125: + // forward error, a proper message will be forwarded + exit(125); + } + error(EPIPE, "syslogger"); +} diff --git a/ocaml/forkexecd/helper/vfork_helper.h b/ocaml/forkexecd/helper/vfork_helper.h new file mode 100644 index 00000000000..08fd1ff21af --- /dev/null +++ b/ocaml/forkexecd/helper/vfork_helper.h @@ -0,0 +1,23 @@ +/* + * Copyright (C) Citrix Systems Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + */ + +#pragma once + +// Common structure to pass errors from helper to library +typedef struct { + // numeric C error + int err; + // message + char msg_buf[1000]; +} msg_t; diff --git a/ocaml/forkexecd/lib/dune b/ocaml/forkexecd/lib/dune index 749f173b977..b4e02786e67 100644 --- a/ocaml/forkexecd/lib/dune +++ b/ocaml/forkexecd/lib/dune @@ -15,4 +15,10 @@ xapi-stdext-unix xapi-tracing ) - (preprocess (per_module ((pps ppx_deriving_rpc) Fe)))) + (preprocess (per_module ((pps ppx_deriving_rpc) Fe))) + (foreign_stubs + (language c) + (names fe_stubs) + (include_dirs ../helper) + (flags :standard -Wall -Werror) + )) diff --git a/ocaml/forkexecd/lib/fe_stubs.c b/ocaml/forkexecd/lib/fe_stubs.c new file mode 100644 index 00000000000..07ae3906551 --- /dev/null +++ b/ocaml/forkexecd/lib/fe_stubs.c @@ -0,0 +1,405 @@ +/* + * Copyright (C) Citrix Systems Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "../helper/vfork_helper.h" + +#define FOREACH_LIST(name, list) \ + for(value name = (list); name != Val_emptylist; name = Field(name, 1)) + +// Create thread reducing stack usage to a minimum to reduce memory usage. +// Returns error number (like pthread_create). +static int create_thread_minstack(pthread_t *th, void *(*proc)(void *), void *arg); + +static inline void +reap_pid(pid_t pid) +{ + int status; + while (waitpid(pid, &status, 0) < 0 && errno == EINTR) + continue; +} + +static void * +thread_proc_reap(void *arg) +{ + pid_t pid = (pid_t) (intptr_t) arg; + + reap_pid(pid); + + return NULL; +} + +// Appends a string to *p_dest buffer. +// It updates *p_dest to point after copied string. +// Returns copied string. +static char * +append_string(char **p_dest, const char *s) +{ + char *const dest = *p_dest; + size_t const size = strlen(s) + 1; + memcpy(dest, s, size); + *p_dest = dest + size; + return dest; +} + +static char** +copy_string_list(value list) +{ + size_t strings_size = 0; + size_t list_size = 0; + char **res, **ptrs; + char *strings; + + FOREACH_LIST(item, list) { + strings_size += strlen(String_val(Field(item, 0))) + 1; + ++list_size; + } + + res = (char **) malloc(sizeof(char*) * (list_size + 1) + strings_size); + if (!res) + return NULL; + + ptrs = res; + strings = (char *) (res + (list_size + 1)); + FOREACH_LIST(item, list) + *ptrs++ = append_string(&strings, String_val(Field(item, 0))); + *ptrs = NULL; + + return res; +} + +static void +close_fd(int *const p_fd) +{ + const int fd = *p_fd; + if (fd >= 0) { + *p_fd = -1; + close(fd); + } +} + +typedef struct { + const char *err_msg; + pid_t pid; + msg_t msg; +} safe_exec_result; + +static int +safe_exec_with_helper(safe_exec_result *res, char **args, char **envs) +{ + int err = EINVAL; + char fd_string[48]; + int pipe_fds[2] = { -1, -1 }; + + res->err_msg = "safe_exec"; + + if (!args[0] || !args[1] || !args[2]) + return EINVAL; + + if (strcmp(args[1], "-e") == 0) { + if (pipe(pipe_fds) < 0) { + res->err_msg = "pipe"; + return errno; + } + sprintf(fd_string, "%d", pipe_fds[1]); + args[2] = fd_string; + } + + sigset_t sigset, old_sigset; + int cancellation_state; + + // Disable cancellation to avoid some signals. + // Glibc use some signals to handle thread cancellation. + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cancellation_state); + + // Block all possible signals to avoid receiving some in the child. + // Signal mask is inherited to new process/thread will start with + // all signals disabled and we can safely change them. + sigfillset(&sigset); + pthread_sigmask(SIG_BLOCK, &sigset, &old_sigset); + + // fork + err = 0; + res->pid = vfork(); + if (res->pid < 0) { + err = errno; + } else if (res->pid == 0) { + // child + if (pipe_fds[0] >= 0) + close(pipe_fds[0]); + execve(args[0], args, envs); + // keep compatibility with forkexecd daemon. + _exit(errno == ENOENT ? 127 : 126); + } + + // Restore thread state + pthread_sigmask(SIG_SETMASK, &old_sigset, NULL); + pthread_setcancelstate(cancellation_state, NULL); + + // We don't need writing pipe anymore and we need to detect + // if closed so we can't keep it open + close_fd(&pipe_fds[1]); + + if (err != 0) { + close_fd(&pipe_fds[0]); + res->err_msg = "vfork"; + return err; + } + + // Handle errors from helper + if (pipe_fds[0] >= 0) { + int readed; + // Note that buffer is small and written atomically by + // the helper, no reason for the kernel to split it. + while ((readed = read(pipe_fds[0], &res->msg, sizeof(res->msg))) < 0 + && errno == EINTR) + continue; + close_fd(&pipe_fds[0]); + if (readed != 0 && readed < offsetof(msg_t, msg_buf) + 1) { + // This should never happen !!! + // At this point the process is created and we have a pid so + // we cannot just return an error. + // We could try to wait the process but it should fail, let + // returns success and let caller read process status result. + return 0; + } + res->msg.msg_buf[sizeof(res->msg.msg_buf) - 1] = 0; + if (readed > 0) { + // Wait the process otherwise we'll have a zombie + reap_pid(res->pid); + + res->err_msg = res->msg.msg_buf; + return res->msg.err; + } + } + return 0; +} + +CAMLprim value +caml_safe_exec_with_helper(value args, value environment) +{ + CAMLparam2(args, environment); + + // Copy parameters to C + char **c_args = copy_string_list(args); + char **c_envs = copy_string_list(environment); + if (!c_envs || !c_args) { + free(c_envs); + free(c_args); + caml_raise_out_of_memory(); + } + + // potentially slow section, release Ocaml engine + caml_enter_blocking_section(); + + safe_exec_result res; + int err = safe_exec_with_helper(&res, c_args, c_envs); + + free(c_envs); + free(c_args); + + caml_leave_blocking_section(); + + // error, notify with an exception + if (err != 0) + unix_error(err, res.err_msg, Nothing); + + CAMLreturn(Val_int(res.pid)); +} + +CAMLprim value +caml_pidwaiter_dontwait(value pid_val) +{ + CAMLparam1(pid_val); + pid_t pid = Int_val(pid_val); + + // reap the pid to avoid zombies + pthread_t th; + if (create_thread_minstack(&th, thread_proc_reap, (void *) (intptr_t) pid) == 0) + pthread_detach(th); + + CAMLreturn(Val_unit); +} + +typedef struct { + pid_t pid; + bool timed_out; + bool stop; + struct timespec deadline; + pthread_mutex_t mtx; + pthread_cond_t cond; +} timeout_kill; + +static void * +thread_proc_timeout_kill(void *arg) +{ + timeout_kill *tm = (timeout_kill *) arg; + + pthread_mutex_lock(&tm->mtx); + int res = tm->stop ? 0: + pthread_cond_timedwait(&tm->cond, &tm->mtx, &tm->deadline); + pthread_mutex_unlock(&tm->mtx); + + if (res == ETIMEDOUT) { + kill(tm->pid, SIGKILL); + tm->timed_out = true; + } + return NULL; +} + +static int +create_thread_minstack(pthread_t *th, void *(*proc)(void *), void *arg) +{ + int res; + + // disable any possible signal handler so we can safely use a small stack + // for the thread + sigset_t sigset, old_sigset; + sigfillset(&sigset); + pthread_sigmask(SIG_BLOCK, &sigset, &old_sigset); + + pthread_attr_t th_attr; + res = pthread_attr_init(&th_attr); + if (res) + return res; + pthread_attr_setstacksize(&th_attr, PTHREAD_STACK_MIN); + + res = pthread_create(th, &th_attr, proc, arg); + + pthread_attr_destroy(&th_attr); + pthread_sigmask(SIG_SETMASK, &old_sigset, NULL); + + return res; +} + +/* + * Wait a process with a given timeout. + * At the end of timeout (if trigger) kill the process. + * To avoid race we need to wait a specific process, but this is blocking + * and we use a timeout to implement the wait. Timer functions are per + * process, not per thread. + * Returns <0 if error, 0 if not timed out, >0 if timedout. + */ +static int +wait_process_timeout(pid_t pid, double timeout) +{ + int err; + + // compute deadline + timeout_kill tm = { pid, false, false }; + if (clock_gettime(CLOCK_MONOTONIC, &tm.deadline) < 0) + return -errno; + + double f = floor(timeout); + tm.deadline.tv_sec += f; + tm.deadline.tv_nsec += (timeout - f) * 1000000000.; + if (tm.deadline.tv_nsec >= 1000000000) { + tm.deadline.tv_nsec -= 1000000000; + tm.deadline.tv_sec += 1; + } + + pthread_condattr_t attr; + err = pthread_condattr_init(&attr); + if (err) + return -err; + err = pthread_condattr_setclock(&attr, CLOCK_MONOTONIC); + if (!err) + err = pthread_cond_init(&tm.cond, &attr); + pthread_condattr_destroy(&attr); + if (err) + return -err; + + err = pthread_mutex_init(&tm.mtx, NULL); + if (err) { + pthread_cond_destroy(&tm.cond); + return -err; + } + + // Create timeout thread + pthread_t th; + err = create_thread_minstack(&th, thread_proc_timeout_kill, &tm); + if (err) { + pthread_cond_destroy(&tm.cond); + pthread_mutex_destroy(&tm.mtx); + return -err; + } + + // Wait the process, we avoid to reap the other process to avoid + // race conditions. Consider: + // - process exit; + // - we reap the thread; + // - OS reuse the pid; + // - timeout thread terminate the pid, now reused. + // Avoiding reaping the process will create a zombie process so + // the KILL would be directed to that. + siginfo_t info; + waitid(P_PID, pid, &info, WEXITED|WNOWAIT); + + // Close the timeout thread + pthread_mutex_lock(&tm.mtx); + // We use also a variable to avoid races like + // - we create the thread; + // - we start waiting the process which was already exited; + // - we came here trying to close the thread; + // - thread waits for signal. + tm.stop = true; + pthread_cond_broadcast(&tm.cond); + pthread_mutex_unlock(&tm.mtx); + pthread_join(th, NULL); + + // Cleanup + pthread_cond_destroy(&tm.cond); + pthread_mutex_destroy(&tm.mtx); + + return tm.timed_out ? 1 : 0; +} + +CAMLprim value +caml_pidwaiter_waitpid(value timeout_value, value pid_value) +{ + CAMLparam0(); + double timeout = timeout_value == Val_none ? 0 : Double_val(Some_val(timeout_value)); + pid_t pid = Int_val(pid_value); + + caml_enter_blocking_section(); + + bool timed_out = false; + int err = 0; + if (timeout > 0) { + int res = wait_process_timeout(pid, timeout); + if (res < 0) + err = -res; + else if (res != 0) + timed_out = true; + } + + caml_leave_blocking_section(); + + if (err) + unix_error(err, "waitpid", Nothing); + + CAMLreturn(timed_out ? Val_true: Val_false); +} diff --git a/ocaml/forkexecd/lib/forkhelpers.ml b/ocaml/forkexecd/lib/forkhelpers.ml index 7b7fc0b2247..9dc065e7ff9 100644 --- a/ocaml/forkexecd/lib/forkhelpers.ml +++ b/ocaml/forkexecd/lib/forkhelpers.ml @@ -40,12 +40,32 @@ let with_tracing ~tracing ~name f = Tracing.with_tracing ~parent:tracing ~name f let finally = Xapi_stdext_pervasives.Pervasiveext.finally -type pidty = Unix.file_descr * int +let use_daemon = ref false + +module FEStubs = struct + external safe_exec_with_helper : string list -> string list -> int + = "caml_safe_exec_with_helper" + + (* timeout <= 0 wait infinite *) + external pidwaiter_waitpid : ?timeout:float -> int -> bool + = "caml_pidwaiter_waitpid" + + (* do not wait for a process, release it, it won't generate a zombie process *) + external pidwaiter_dontwait : int -> unit = "caml_pidwaiter_dontwait" +end + +type waiter = Pidwaiter | Sock of Unix.file_descr + +type pidty = waiter * int (* The forking executioner has been used, therefore we need to tell *it* to waitpid *) -let string_of_pidty (fd, pid) = - Printf.sprintf "(FEFork (%d,%d))" (Fd_send_recv.int_of_fd fd) pid +let string_of_pidty (waiter, pid) = + match waiter with + | Pidwaiter -> + Printf.sprintf "(FEFork (%d))" pid + | Sock fd -> + Printf.sprintf "(FEFork (%d,%d))" (Fd_send_recv.int_of_fd fd) pid exception Subprocess_failed of int @@ -53,7 +73,7 @@ exception Subprocess_killed of int exception Subprocess_timeout -let waitpid (sock, pid) = +let waitpid_daemon sock pid = let status = Fecomms.read_raw_rpc sock in Unix.close sock ; match status with @@ -79,7 +99,7 @@ let waitpid (sock, pid) = (* [waitpid_nohang] reports the status of a socket to a process. The intention is to make this non-blocking. If the process is finished, the socket is closed and not otherwise. *) -let waitpid_nohang (sock, pid) = +let waitpid_nohang_daemon sock pid = let verbose = false in if verbose then D.debug "%s pid=%d" __FUNCTION__ pid ; let fail fmt = Printf.ksprintf failwith fmt in @@ -118,7 +138,7 @@ let waitpid_nohang (sock, pid) = fail "%s: error happened when trying to read the status. %s" __FUNCTION__ (Printexc.to_string exn) -let dontwaitpid (sock, _pid) = +let dontwaitpid_daemon sock _pid = ( try (* Try to tell the child fe that we're not going to wait for it. If the other end of the pipe has been closed then this doesn't matter, as this @@ -128,6 +148,27 @@ let dontwaitpid (sock, _pid) = ) ; Unix.close sock +let waitpid (waiter, pid) = + match waiter with + | Pidwaiter -> + Unix.waitpid [] pid + | Sock sock -> + waitpid_daemon sock pid + +let waitpid_nohang (waiter, pid) = + match waiter with + | Pidwaiter -> + Unix.waitpid [Unix.WNOHANG] pid + | Sock sock -> + waitpid_nohang_daemon sock pid + +let dontwaitpid (waiter, pid) = + match waiter with + | Pidwaiter -> + FEStubs.pidwaiter_dontwait pid + | Sock sock -> + dontwaitpid_daemon sock pid + let waitpid_fail_if_bad_exit ty = let _, status = waitpid ty in match status with @@ -140,7 +181,7 @@ let waitpid_fail_if_bad_exit ty = | Unix.WSTOPPED n -> raise (Subprocess_killed n) -let getpid (_sock, pid) = pid +let getpid (_waiter, pid) = pid type 'a result = Success of string * 'a | Failure of string * exn @@ -179,12 +220,9 @@ type syslog_stdout = | Syslog_DefaultKey | Syslog_WithKey of string -(** Safe function which forks a command, closing all fds except a whitelist and - having performed some fd operations in the child *) -let safe_close_and_exec ?tracing ?env stdin stdout stderr +let safe_close_and_exec_daemon ?tracing env stdin stdout stderr (fds : (string * Unix.file_descr) list) ?(syslog_stdout = NoSyslogging) - ?(redirect_stderr_to_stdout = false) (cmd : string) (args : string list) = - with_tracing ~tracing ~name:__FUNCTION__ @@ fun tracing -> + ?(redirect_stderr_to_stdout = false) args = let sock = Fecomms.open_unix_domain_sock_client ?tracing (Filename.concat runtime_path "/xapi/forker/main") @@ -228,7 +266,6 @@ let safe_close_and_exec ?tracing ?env stdin stdout stderr List.fold_left maybe_add_id_to_fd_map dest_named_fds predefined_fds in - let env = Option.value ~default:default_path_env_pair env in let syslog_stdout = match syslog_stdout with | NoSyslogging -> @@ -241,7 +278,7 @@ let safe_close_and_exec ?tracing ?env stdin stdout stderr Fecomms.write_raw_rpc ?tracing sock (Fe.Setup { - Fe.cmdargs= cmd :: args + Fe.cmdargs= args ; env= Array.to_list env ; id_to_fd_map ; syslog_stdout @@ -295,7 +332,7 @@ let safe_close_and_exec ?tracing ?env stdin stdout stderr match Fecomms.read_raw_rpc ?tracing sock with | Ok (Fe.Execed pid) -> remove_fd_from_close_list sock ; - (sock, pid) + (Sock sock, pid) | Ok status -> let msg = Printf.sprintf @@ -314,6 +351,64 @@ let safe_close_and_exec ?tracing ?env stdin stdout stderr ) close_fds +let safe_close_and_exec_vfork ?tracing env stdin stdout stderr + (fds : (string * Unix.file_descr) list) ?(syslog_stdout = NoSyslogging) + ?(redirect_stderr_to_stdout = false) cmd args = + let string_of_fd (fd : Unix.file_descr) = string_of_int (Obj.magic fd) in + let args = "--" :: args in + let args = if redirect_stderr_to_stdout then "-S" :: args else args in + let args = + match syslog_stdout with + | NoSyslogging -> + args + | Syslog_DefaultKey -> + "-s" :: Filename.basename cmd :: args + | Syslog_WithKey key -> + "-s" :: key :: args + in + let args = + List.fold_right + (fun (uuid, fd) args -> + Unix.clear_close_on_exec fd ; + "-m" :: uuid :: string_of_fd fd :: args + ) + fds args + in + let add_std args arg fd = + match fd with + | Some fd -> + Unix.clear_close_on_exec fd ; + arg :: string_of_fd fd :: args + | None -> + args + in + let args = add_std args "-E" stderr in + let args = add_std args "-O" stdout in + let args = add_std args "-I" stdin in + let args = "/usr/libexec/xapi/vfork_helper" :: "-e" :: "DUMMY" :: args in + (* Convert environment and add tracing variables. *) + let env = + List.append (Tracing.EnvHelpers.of_span tracing) (Array.to_list env) + in + let pid = FEStubs.safe_exec_with_helper args env in + (Pidwaiter, pid) + +(** Safe function which forks a command, closing all fds except a whitelist and + having performed some fd operations in the child *) +let safe_close_and_exec ?tracing ?env stdin stdout stderr + (fds : (string * Unix.file_descr) list) ?(syslog_stdout = NoSyslogging) + ?(redirect_stderr_to_stdout = false) (cmd : string) (args : string list) = + with_tracing ~tracing ~name:__FUNCTION__ @@ fun tracing -> + let args = cmd :: args in + let env = Option.value ~default:default_path_env_pair env in + + if not !use_daemon then (* Build a list of arguments as helper wants. *) + safe_close_and_exec_vfork ?tracing env stdin stdout stderr fds + ~syslog_stdout ~redirect_stderr_to_stdout cmd args + else + safe_close_and_exec_daemon ?tracing env stdin stdout stderr fds + ~syslog_stdout ~redirect_stderr_to_stdout args + let execute_command_get_output_inner ?tracing ?env ?stdin ?(syslog_stdout = NoSyslogging) ?(redirect_stderr_to_stdout = false) ?(timeout = -1.0) cmd args = @@ -342,7 +437,7 @@ let execute_command_get_output_inner ?tracing ?env ?stdin with_tracing ~tracing ~name:"Forkhelpers.with_logfile_err_fd" @@ fun tracing -> with_logfile_fd "execute_command_get_err" (fun err_fd -> - let sock, pid = + let waiter, pid = safe_close_and_exec ?tracing ?env (Option.map (fun (_, fd, _) -> fd) stdinandpipes) (Some out_fd) (Some err_fd) [] ~syslog_stdout @@ -354,14 +449,26 @@ let execute_command_get_output_inner ?tracing ?env ?stdin close wr ) stdinandpipes ; - if timeout > 0. then - Unix.setsockopt_float sock Unix.SO_RCVTIMEO timeout ; - with_tracing ~tracing ~name:"Forkhelpers.waitpid" @@ fun _ -> - try waitpid (sock, pid) - with Unix.(Unix_error ((EAGAIN | EWOULDBLOCK), _, _)) -> - Unix.kill pid Sys.sigkill ; - ignore (waitpid (sock, pid)) ; - raise Subprocess_timeout + match waiter with + | Pidwaiter -> + with_tracing ~tracing ~name:"Forkhelpers.waitpid" + @@ fun _ -> + let timedout = FEStubs.pidwaiter_waitpid ~timeout pid in + let res = Unix.waitpid [] pid in + + if timedout then raise Subprocess_timeout ; + res + | Sock sock -> ( + if timeout > 0. then + Unix.setsockopt_float sock Unix.SO_RCVTIMEO timeout ; + with_tracing ~tracing ~name:"Forkhelpers.waitpid" + @@ fun _ -> + try waitpid_daemon sock pid + with Unix.(Unix_error ((EAGAIN | EWOULDBLOCK), _, _)) -> + Unix.kill pid Sys.sigkill ; + ignore (waitpid_daemon sock pid) ; + raise Subprocess_timeout + ) ) ) with diff --git a/ocaml/forkexecd/test/dune b/ocaml/forkexecd/test/dune index b190bed81aa..4e57ff2804e 100644 --- a/ocaml/forkexecd/test/dune +++ b/ocaml/forkexecd/test/dune @@ -13,6 +13,6 @@ (rule (alias runtest) (package xapi-forkexecd) - (deps fe_test.sh fe_test.exe ../src/fe_main.exe syslog.so) + (deps fe_test.sh fe_test.exe ../src/fe_main.exe syslog.so ../helper/vfork_helper) (action (run ./fe_test.sh))) diff --git a/ocaml/forkexecd/test/fe_test.sh b/ocaml/forkexecd/test/fe_test.sh index fe454e89802..05d855f85e7 100755 --- a/ocaml/forkexecd/test/fe_test.sh +++ b/ocaml/forkexecd/test/fe_test.sh @@ -8,7 +8,9 @@ export FE_TEST=1 SOCKET=${XDG_RUNTIME_DIR}/xapi/forker/main rm -f "$SOCKET" -LD_PRELOAD="$PWD/syslog.so" ../src/fe_main.exe & +LD_PRELOAD="$PWD/syslog.so" \ +TEST_VFORK_HELPER="$PWD/../helper/vfork_helper" \ +../src/fe_main.exe & MAIN=$! cleanup () { kill $MAIN @@ -17,4 +19,6 @@ trap cleanup EXIT INT for _ in $(seq 1 10); do test -S ${SOCKET} || sleep 1 done -echo "" | LD_PRELOAD="$PWD/syslog.so" ./fe_test.exe 16 +echo "" | LD_PRELOAD="$PWD/syslog.so" \ +TEST_VFORK_HELPER="$PWD/../helper/vfork_helper" \ +./fe_test.exe 16 diff --git a/ocaml/forkexecd/test/syslog.c b/ocaml/forkexecd/test/syslog.c index 2316e84a25e..10e3dc3c79f 100644 --- a/ocaml/forkexecd/test/syslog.c +++ b/ocaml/forkexecd/test/syslog.c @@ -18,6 +18,23 @@ if (!old_func) \ old_func = (typeof(name) *) dlsym(RTLD_NEXT, #name); +#define strlcpy _strlcpy +static size_t +strlcpy(char *dest, const char *src, size_t len) +{ + size_t l = strlen(src); + + if (len) { + --len; + if (l <= len) + len = l; + + memcpy(dest, src, len); + dest[len] = 0; + } + return l; +} + int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen) { static const char dev_log[] = "/dev/log"; @@ -119,3 +136,50 @@ void __vsyslog_chk(int priority, int flags, const char *format, va_list ap) { vsyslog_internal(priority, format, ap); } + +static char vfork_helper[256] = "/usr/libexec/xapi/vfork_helper"; +static char ld_preload[512]; + +static const char ld_prefix[] = "LD_PRELOAD="; +enum { len_prefix = sizeof(ld_prefix) - 1 }; + +__attribute__((constructor)) +static void initialize(void) +{ + const char *env; + env = getenv("TEST_VFORK_HELPER"); + if (env) + strlcpy(vfork_helper, env, sizeof(vfork_helper)); + env = getenv("LD_PRELOAD"); + if (env) { + strcpy(ld_preload, ld_prefix); + strlcpy(ld_preload + len_prefix, env, sizeof(ld_preload) - len_prefix); + } +} + +int execve(const char *pathname, char *const argv[], char *const envp[]) +{ + START(execve); + + if (strcmp(pathname, "/usr/libexec/xapi/vfork_helper") == 0) + pathname = vfork_helper; + + if (envp && ld_preload[0]) { + bool ok = false; + size_t num_env = 0; + for (char * const *e = envp; *e; ++e) { + ++num_env; + if (strncmp(*e, ld_prefix, len_prefix) == 0) + ok = true; + } + if (!ok) { + // allocate on stack, we could be inside a vfork() created process + char **new_envs = alloca(sizeof(char*) * (num_env + 2)); + *new_envs = ld_preload; + memcpy(new_envs + 1, envp, sizeof(char*) * (num_env + 1)); + envp = new_envs; + } + } + + return old_func(pathname, argv, envp); +} diff --git a/ocaml/libs/stunnel/stunnel.ml b/ocaml/libs/stunnel/stunnel.ml index 8d319b4b80d..c7fb013ff54 100644 --- a/ocaml/libs/stunnel/stunnel.ml +++ b/ocaml/libs/stunnel/stunnel.ml @@ -110,11 +110,6 @@ type pid = | FEFork of Forkhelpers.pidty (** the forkhelpers module did it for us. *) | Nopid -(* let string_of_pid = function - | StdFork x -> Printf.sprintf "(StdFork %d)" x - | FEFork x -> Forkhelpers.string_of_pidty x - | Nopid -> "None" *) - let getpid ty = match ty with | StdFork pid ->