#![allow(clippy::undocumented_unsafe_blocks)]

use std::{
    env,
    io::stdin,
    mem,
    os::{
        fd::{AsRawFd, FromRawFd, OwnedFd, RawFd},
        unix::ffi::OsStrExt,
    },
    ptr,
};

use btoi::btoi;
use memchr::arch::all::is_prefix;
use nix::{
    errno::Errno,
    libc,
    sys::{
        resource::{setrlimit, Resource},
        signal::{kill, sigprocmask, SigSet, SigmaskHow, Signal},
    },
    unistd::{execvp, getpid, read, setpgid, setsid, tcsetpgrp, write, Pid},
};

use crate::{
    caps::Capability,
    compat::{set_name, set_pdeathsig},
    config::*,
    confine::{confine_scmp_ioctl, confine_scmp_kptr, confine_scmp_pwritev2, safe_drop_cap},
    err::err2set,
    fd::close,
    ignore_signal,
    landlock::{CompatLevel, Compatible, RestrictSelfFlags, Ruleset, RulesetAttr, Scope},
    retry::retry_on_eintr,
    unshare::{config::Config, error::ErrorCode as Err, run::ChildInfo},
};

unsafe fn fail_errno(code: Err, errno: i32) -> ! {
    let msg = match code {
        Err::CapSet => c"syd: capset error".as_ptr(),
        Err::Exec => c"syd: exec error".as_ptr(),
        Err::ParentDeathSignal => c"syd: parent-death-signal error".as_ptr(),
        Err::PreExec => c"syd: pre-exec error".as_ptr(),
        Err::ProcessStop => c"syd: error stopping process".as_ptr(),
        Err::IgnoreSignal => c"syd: error ignoring signals".as_ptr(),
        Err::ResetSignal => c"syd: error resetting signals".as_ptr(),
        Err::SetResourceLimits => c"syd: error setting resource limits".as_ptr(),
        Err::LandlockFilterScopedSignals => c"syd: error scoping signals with landlock".as_ptr(),
        Err::Seccomp => c"syd: seccomp error".as_ptr(),
        Err::SeccompFilterIoctl => c"syd: seccomp filter ioctl error".as_ptr(),
        Err::SeccompFilterAppendOnly => c"syd: seccomp filter pwritev2 error".as_ptr(),
        Err::SeccompFilterKptr => c"syd: seccomp filter kernel pointer error".as_ptr(),
        Err::SeccompSendFd => c"syd: seccomp send notify-fd error".as_ptr(),
        Err::SeccompWaitFd => c"syd: seccomp wait for notify-fd error".as_ptr(),
        Err::SetSid => c"syd: setsid error".as_ptr(),
        Err::SetPty => c"syd: error setting pty as controlling terminal".as_ptr(),
        Err::DupPty => c"syd: error duplicating pty onto stdio fds".as_ptr(),
        Err::SetPgid => c"syd: error creating new process group".as_ptr(),
        Err::SetPgrp => c"syd: error setting foreground process group".as_ptr(),
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        Err::SetTSC => c"syd: set-tsc error".as_ptr(),
    };
    Errno::set_raw(errno);
    libc::perror(msg as *const libc::c_char);
    libc::_exit(errno);
}

// Terminates the child via `fail_errno` using the errno value that is
// current at the point of expansion.
//
// `$child` is accepted for call-site symmetry but is not used by the
// expansion; `$error` is the error code passed on to `fail_errno`.
macro_rules! fail_safe {
    ($child:expr, $error:expr) => {{
        // Capture errno before anything else gets a chance to change it.
        let saved_errno = Errno::last_raw();
        unsafe { fail_errno($error, saved_errno) }
    }};
}

// Terminates the child via `fail_errno` with an explicitly supplied errno.
//
// `$child` is accepted for call-site symmetry but is not used by the
// expansion; `$error` and `$errno` are forwarded to `fail_errno` unchanged.
macro_rules! fail_errno_safe {
    ($child:expr, $error:expr, $errno:expr) => {{
        unsafe { fail_errno($error, $errno) }
    }};
}

/// Entry point run in the cloned child: prepares the sandbox process and
/// finally replaces it with the target program via `execvp`.
///
/// `arg` is reinterpreted as a raw `Box<ChildInfo>` pointer and is owned by
/// this function from then on.
///
/// The steps run in this order:
/// 1. Set the process name and the parent-death signal.
/// 2. Set up the controlling PTY or a new process group.
/// 3. Optionally reset the signal mask and signal dispositions.
/// 4. Apply the Landlock signal scope and the helper seccomp(2) filters.
/// 5. Close file descriptors and scrub the environment.
/// 6. Run the `pre_exec` callback, the TSC restriction and resource limits.
/// 7. Optionally stop with SIGSTOP so the parent can attach.
/// 8. Load the main seccomp filter and hand its notify fd to the parent
///    over the two pipes.
/// 9. Drop capabilities and exec.
///
/// This function never returns. Fatal errors are reported through
/// `fail_errno`, which prints a message and exits the process.
#[expect(clippy::cognitive_complexity)]
pub fn child_after_clone(arg: *mut libc::c_void) -> ! {
    // Set process name, ignore errors.
    let _ = set_name(c"syd_exec");

    // SAFETY: arg is a valid ChildInfo structure.
    // NOTE(review): Box::from_raw takes ownership, so this presumably
    // expects `arg` to originate from Box::into_raw -- confirm at the caller.
    let mut child: Box<ChildInfo> = unsafe { Box::from_raw(arg as *mut ChildInfo) };

    // Install the configured parent-death signal, if any.
    if let Some(&sig) = child.cfg.death_sig.as_ref() {
        if let Err(errno) = set_pdeathsig(Some(sig)) {
            fail_errno_safe!(child, Err::ParentDeathSignal, errno as i32);
        }
    }

    // Restriction 0: Change controlling terminal to PTY as necessary.
    if let Some(pty_fd) = child.pty_fd.take() {
        // SAFETY: pty_fd is a valid FD.
        // Wrapping it in OwnedFd means the fd is closed when dropped below.
        let pty_fd = unsafe { OwnedFd::from_raw_fd(pty_fd) };

        // Become session leader so we can take a controlling TTY.
        if let Err(errno) = setsid() {
            fail_errno_safe!(child, Err::SetSid, errno as i32);
        }

        // Make the PTY fd our controlling terminal.
        if let Err(errno) =
            Errno::result(unsafe { libc::ioctl(pty_fd.as_raw_fd(), libc::TIOCSCTTY, 0) })
        {
            fail_errno_safe!(child, Err::SetPty, errno as i32);
        }

        // Make us the foreground process group.
        if let Err(errno) = tcsetpgrp(&pty_fd, getpid()) {
            fail_errno_safe!(child, Err::SetPgrp, errno as i32);
        }

        // Duplicate PTY fd onto stdio(3) fds.
        for std_fd in [libc::STDIN_FILENO, libc::STDOUT_FILENO, libc::STDERR_FILENO] {
            if let Err(errno) = Errno::result(unsafe { libc::dup2(pty_fd.as_raw_fd(), std_fd) }) {
                fail_errno_safe!(child, Err::DupPty, errno as i32);
            }
        }

        // Close the original PTY fd.
        drop(pty_fd);
    } else if child.cfg.make_group_leader {
        // Put the sandbox process into a new process group.
        if let Err(errno) = setpgid(Pid::from_raw(0), Pid::from_raw(0)) {
            fail_errno_safe!(child, Err::SetPgid, errno as i32);
        }

        // Ignore SIGTTOU to avoid tcsetpgrp side effect.
        // This will be restored in the next block.
        if let Err(errno) = ignore_signal(Signal::SIGTTOU) {
            fail_errno_safe!(child, Err::IgnoreSignal, errno as i32);
        }

        // Set sandbox process foreground process (best effort).
        let _ = tcsetpgrp(stdin(), getpid());
    }

    // This must happen after ^^PTY handling above,
    // because we want to unignore SIGTTOU.
    if child.cfg.restore_sigmask {
        // Reset blocking signals.
        // Step 1: Reset the signal mask using pthread_sigmask.
        // The return value of pthread_sigmask is not checked here.
        unsafe {
            let mut sigmask: libc::sigset_t = mem::zeroed();
            libc::sigemptyset(&raw mut sigmask);
            libc::pthread_sigmask(libc::SIG_SETMASK, &raw const sigmask, ptr::null_mut());
        }
        // Step 2: Unblock all signals using sigprocmask.
        let sigmask = SigSet::all();
        if let Err(errno) = sigprocmask(SigmaskHow::SIG_UNBLOCK, Some(&sigmask), None) {
            fail_errno_safe!(child, Err::ResetSignal, errno as i32);
        }

        // Reset all signals to their default dispositions.
        if let Err(errno) = crate::reset_signals() {
            fail_errno_safe!(child, Err::ResetSignal, errno as i32);
        }
    }

    // Restriction 1:
    //
    // Apply a Landlock scope sandbox to restrict
    // 1. Ptrace attach outside Landlock.
    // 2. Signal send outside Landlock.
    // We leave path and network restrictions for Landlock
    // to be configured by the user using Lock sandboxing.
    //
    // A failure is fatal only when HAVE_LANDLOCK_SCOPED_SIGNALS is set;
    // otherwise the error is ignored (best effort).
    let compat_level = if *HAVE_LANDLOCK_SCOPED_SIGNALS {
        CompatLevel::HardRequirement
    } else {
        CompatLevel::BestEffort
    };
    if let Err(error) = Ruleset::default()
        .set_compatibility(compat_level)
        .scope(Scope::Signal)
        .and_then(|ruleset| ruleset.create())
        .and_then(|ruleset| ruleset.restrict_self(RestrictSelfFlags::empty()))
    {
        if compat_level == CompatLevel::HardRequirement {
            let errno = err2set(&error);
            fail_errno_safe!(child, Err::LandlockFilterScopedSignals, errno as i32);
        }
    }

    // Restriction 2:
    //
    // Add per-architecture seccomp(2) filters to deny unsafe ioctl(2) requests.
    // If the error carries no errno, ENOSYS is reported instead.
    if let Some(denylist) = child.ioctl_denylist.take() {
        if let Err(error) = confine_scmp_ioctl(&denylist, child.cfg.ssb) {
            let errno = error.errno().unwrap_or(Errno::ENOSYS);
            fail_errno_safe!(child, Err::SeccompFilterIoctl, errno as i32);
        }
    }

    // Restriction 3:
    //
    // Deny RWF_NOAPPEND for pwritev2(2) if append-only is enabled.
    if child.cfg.append_only {
        if let Err(error) = confine_scmp_pwritev2(child.cfg.ssb) {
            let errno = error.errno().unwrap_or(Errno::ENOSYS);
            fail_errno_safe!(child, Err::SeccompFilterAppendOnly, errno as i32);
        }
    }

    // Restriction 4:
    //
    // Restrict kernel pointers in syscall arguments unless trace/allow_unsafe_kptr:1 is set.
    if child.cfg.restrict_kptr {
        if let Err(error) = confine_scmp_kptr(child.cfg.ssb) {
            let errno = error.errno().unwrap_or(Errno::ENOSYS);
            fail_errno_safe!(child, Err::SeccompFilterKptr, errno as i32);
        }
    }

    // SAFETY: Do not leak the static file descriptors to the sandbox process.
    proc_close();

    // SAFETY: Do not leak the following FDs to the sandbox process:
    // 1. Log file descriptor.
    // 2. IPC epoll file descriptor.
    // 3. IPC UNIX socket descriptor.
    // TODO: Move this to config.rs.
    const CLOSE_FD_ENVS: &[&str] = &[ENV_LOG_FD, ENV_IPC_POLL_FD, ENV_IPC_UNIX_FD];
    for env in CLOSE_FD_ENVS {
        // Each variable is parsed as a decimal fd number; unset or
        // unparsable values are skipped silently.
        let fd = if let Some(fd) = env::var_os(env) {
            btoi::<RawFd>(fd.as_bytes()).ok()
        } else {
            None
        };
        if let Some(fd) = fd {
            if fd >= 0 {
                // Close errors are ignored on purpose (best effort).
                let _ = close(fd);
            }
        }
    }

    // Passthrough RUST_BACKTRACE to the sandbox process.
    // The sandbox sees the value of SYD_RUST_BACKTRACE, or no
    // RUST_BACKTRACE at all when that variable is unset.
    match env::var_os("SYD_RUST_BACKTRACE") {
        Some(val) => env::set_var("RUST_BACKTRACE", val),
        None => env::remove_var("RUST_BACKTRACE"),
    }

    // SAFETY: Clean Syd environment variables from process environment.
    // Note, we have just used ENV_LOG_FD above and do not need it anymore.
    // Variables prefixed with SYD_TEST_ are deliberately kept.
    for (key, _) in env::vars_os() {
        if is_prefix(key.as_bytes(), b"CARGO_BIN_EXE_syd")
            || (is_prefix(key.as_bytes(), b"SYD_") && !is_prefix(key.as_bytes(), b"SYD_TEST_"))
        {
            env::remove_var(key);
        }
    }

    // We'll write seccomp(2) notify fd to the second pipe, and
    // read the acknowledgement notification from the first pipe.
    let (pipe_ro, pipe_rw) = (child.seccomp_pipefd.0 .0, child.seccomp_pipefd.1 .1);

    // Close the unused ends of the pipes.
    drop(child.seccomp_pipefd.0 .1);
    drop(child.seccomp_pipefd.1 .0);

    // Run the user-supplied pre-exec callback, if any; its error is fatal.
    if let Some(callback) = &child.pre_exec {
        if let Err(errno) = callback() {
            fail_errno_safe!(child, Err::PreExec, errno as i32);
        }
    }

    // On x86 only: switch the TSC mode to PR_TSC_SIGSEGV when requested.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    if child.cfg.deny_tsc {
        if let Err(errno) =
            Errno::result(unsafe { libc::prctl(libc::PR_SET_TSC, libc::PR_TSC_SIGSEGV) })
        {
            fail_errno_safe!(child, Err::SetTSC, errno as i32);
        }
    }

    if child.cfg.restrict_prlimit {
        // Set resource limits before seccomp(2), because it will deny prlimit(2).
        if let Err(errno) = set_resource_limits(&child.cfg) {
            fail_errno_safe!(child, Err::SetResourceLimits, errno as i32);
        }
    }

    if child.cfg.stop {
        // Stop the process to give the parent a chance to seize us and set ptrace options.
        // This must happen _before_ loading the seccomp filter.
        if let Err(errno) = kill(getpid(), Signal::SIGSTOP) {
            fail_errno_safe!(child, Err::ProcessStop, errno as i32);
        }
    }

    if let Some(seccomp_filter) = child.seccomp_filter {
        // Load the seccomp filter.
        // On failure, report the absolute value of the raw return code,
        // falling back to the current errno if none is available.
        if let Err(scmp_err) = seccomp_filter.load() {
            fail_errno_safe!(
                child,
                Err::Seccomp,
                scmp_err
                    .sysrawrc()
                    .map(|errno| errno.abs())
                    .unwrap_or_else(|| Errno::last() as i32)
            );
        }

        // Get seccomp notification fd.
        let seccomp_fd = match seccomp_filter.get_notify_fd() {
            Ok(fd) => {
                // SAFETY: get_notify_fd returns a valid FD.
                unsafe { OwnedFd::from_raw_fd(fd) }
            }
            Err(scmp_err) => fail_errno_safe!(
                child,
                Err::Seccomp,
                scmp_err
                    .sysrawrc()
                    .map(|errno| errno.abs())
                    .unwrap_or_else(|| Errno::last() as i32)
            ),
        };

        // Write the value of the seccomp notify fd to the pipe.
        // Handle partial writes and interrupts.
        // EOF means parent died before reading.
        // The fd number is sent as its little-endian byte representation.
        let fd = seccomp_fd.as_raw_fd().to_le_bytes();
        let mut nwrite = 0;
        while nwrite < fd.len() {
            #[expect(clippy::arithmetic_side_effects)]
            match retry_on_eintr(|| write(&pipe_rw, &fd[nwrite..])) {
                Ok(0) => {
                    // Parent died before reading.
                    // This should ideally never happen.
                    fail_errno_safe!(child, Err::SeccompSendFd, Errno::EIO as i32);
                }
                Ok(n) => nwrite += n,
                Err(errno) => fail_errno_safe!(child, Err::SeccompSendFd, errno as i32),
            }
        }

        // Close the write end of the pipe.
        drop(pipe_rw);

        // Wait for the parent to get the file descriptor.
        // Handle interrupts.
        // Partial read is not possible.
        // EOF means parent died before writing to the pipe.
        // The acknowledgement is a single byte with the value 42.
        let mut buf = [0u8; 1];
        match retry_on_eintr(|| read(&pipe_ro, &mut buf[..])) {
            Ok(0) => {
                // Parent died before writing.
                // This should ideally never happen.
                fail_errno_safe!(child, Err::SeccompWaitFd, Errno::EIO as i32);
            }
            Ok(1) if buf[0] == 42 => {
                // Parent received seccomp fd successfully.
                // We can go ahead and close our copy now.
            }
            // Any other byte value is treated as a bug and panics.
            Ok(_) => unreachable!("BUG: The meaning of life is not {:#x}!", buf[0]),
            Err(errno) => fail_errno_safe!(child, Err::SeccompWaitFd, errno as i32),
        }

        // Close our copy of the seccomp-notify fd.
        // Parent process has already acknowledged that
        // it has received a copy of this fd.
        drop(seccomp_fd);

        // Release resources for seccomp BPF filter.
        // Memory allocation/deallocation is OK here
        // now that we have transferred over the
        // seccomp-notify fd to the parent process.
        // Otherwise we'd risk breaking Memory sandboxing.
        drop(seccomp_filter);

        // Close the read end of the pipe.
        drop(pipe_ro);
    } else {
        // Close unused ends of the pipes.
        drop(pipe_ro);
        drop(pipe_rw);
    }

    // SAFETY: Drop the following capabilities unconditionally.
    // 1. CAP_CHOWN: for privileged chown(2)
    // 2. CAP_MKNOD: for privileged mknod(2)
    // 3. CAP_NET_BIND_SERVICE: for privileged bind(2)
    // 4. CAP_NET_RAW: for privileged socket(2)
    // These system calls happen in syd-emulator threads
    // even if the respective unsafe options are set,
    // therefore dropping the caps here ensures this.
    const CAP_DROP: &[Capability] = &[
        Capability::CAP_CHOWN,
        Capability::CAP_MKNOD,
        Capability::CAP_NET_BIND_SERVICE,
        Capability::CAP_NET_RAW,
    ];
    for cap in CAP_DROP {
        // The error value is discarded; fail_safe! reports the current
        // errno instead.
        if safe_drop_cap(*cap).is_err() {
            fail_safe!(child, Err::CapSet);
        }
    }

    // Drop CAP_SYS_PTRACE late as Syd may need it.
    // It is kept only when `cfg.keep` is set.
    if !child.cfg.keep && safe_drop_cap(Capability::CAP_SYS_PTRACE).is_err() {
        fail_safe!(child, Err::CapSet);
    }

    if !child.cfg.restrict_prlimit {
        // Set resource limits after seccomp(2) with trace/allow_unsafe_prlimit:1.
        if let Err(errno) = set_resource_limits(&child.cfg) {
            fail_errno_safe!(child, Err::SetResourceLimits, errno as i32);
        }
    }

    // Exit immediately if export mode is in effect.
    // NOTE(review): if ENV_DUMP_SCMP starts with "SYD_" (and not
    // "SYD_TEST_"), the environment cleanup loop above has already removed
    // it and this check can never succeed -- confirm against config.rs.
    if env::var_os(ENV_DUMP_SCMP).is_some() {
        unsafe { libc::_exit(0) };
    }

    // execvp only returns on failure, so the pattern below is irrefutable.
    let Err(errno) = execvp(&child.exe_file, &child.exe_args);
    fail_errno_safe!(child, Err::Exec, errno as i32);
}

/// Applies every resource limit configured in `cfg` to the current process.
///
/// Each `cfg.rlimit_*` field that is `Some(lim)` is installed with
/// `setrlimit`, using `lim` as both the soft and the hard limit. Fields
/// that are `None` leave the corresponding resource untouched.
///
/// # Errors
///
/// Returns the `Errno` of the first `setrlimit` call that fails; limits
/// later in the table are not attempted.
fn set_resource_limits(cfg: &Config) -> Result<(), Errno> {
    // One row per resource, applied in order. Pairing each resource with
    // its configuration field in a single table avoids the copy-paste
    // mismatch that used to apply `rlimit_sigpending` to RLIMIT_RTTIME.
    let limits = [
        (Resource::RLIMIT_AS, cfg.rlimit_as),
        (Resource::RLIMIT_CORE, cfg.rlimit_core),
        (Resource::RLIMIT_CPU, cfg.rlimit_cpu),
        (Resource::RLIMIT_DATA, cfg.rlimit_data),
        (Resource::RLIMIT_FSIZE, cfg.rlimit_fsize),
        (Resource::RLIMIT_MEMLOCK, cfg.rlimit_memlock),
        (Resource::RLIMIT_MSGQUEUE, cfg.rlimit_msgqueue),
        (Resource::RLIMIT_NICE, cfg.rlimit_nice),
        (Resource::RLIMIT_NOFILE, cfg.rlimit_nofile),
        (Resource::RLIMIT_NPROC, cfg.rlimit_nproc),
        (Resource::RLIMIT_RTPRIO, cfg.rlimit_rtprio),
        (Resource::RLIMIT_RTTIME, cfg.rlimit_rttime),
        (Resource::RLIMIT_SIGPENDING, cfg.rlimit_sigpending),
        (Resource::RLIMIT_STACK, cfg.rlimit_stack),
    ];

    for (resource, limit) in limits {
        if let Some(lim) = limit {
            // Soft and hard limit are set to the same value.
            setrlimit(resource, lim, lim)?;
        }
    }
    Ok(())
}
