From f27292ed02f4427bb2e404193832db75b2ba8a14 Mon Sep 17 00:00:00 2001 From: Tulio A M Mendes Date: Mon, 25 May 2026 10:31:24 -0300 Subject: [PATCH] kernel: POSIX compliance and robustness fixes (Round 3) MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Round 3 of the audit fix plan — medium-severity POSIX compliance fixes: - ftruncate: reject O_RDONLY fd with EBADF (write permission check) - truncate: check vfs_check_permission for write access (EACCES) - O_EXCL: return EEXIST when O_CREAT|O_EXCL on existing file - O_DIRECTORY: return ENOTDIR when O_DIRECTORY on non-directory - posix_spawn: fix _syscall2→_syscall4 to pass all 4 args (pid_out, path, argv, envp) matching kernel handler expectations - SYSCALL_MKDIR: accept mode argument from user space (passed through to syscall_mkdir_impl; VFS backends don't use it yet) - CLONE_VM: add address-space refcount table to prevent use-after-free when thread group leader exits before threads. g_as_refcnt[] tracks refs per addr_space value; parent and child each hold a ref; last ref to exit destroys the AS. Tests: 116/116 smoke, 142/142 battery, 111/111 host, cppcheck clean --- include/process.h | 1 + src/kernel/scheduler.c | 47 ++++++++++++++++++++++++++++++++++++++++-- src/kernel/syscall.c | 14 +++++++++++-- user/ulibc/src/spawn.c | 3 +-- 4 files changed, 59 insertions(+), 6 deletions(-) diff --git a/include/process.h b/include/process.h index 7d4340bd..7f0ce7f0 100644 --- a/include/process.h +++ b/include/process.h @@ -152,6 +152,7 @@ struct process { /* Thread support */ uint32_t tgid; /* Thread group ID (== pid for group leader) */ uint32_t flags; /* PROCESS_FLAG_* */ + uint32_t as_refcount; /* CLONE_VM address space reference count */ uintptr_t tls_base; /* User-space TLS base (set via SET_THREAD_AREA) */ uint32_t* clear_child_tid; /* User address to clear + futex-wake on exit */ diff --git a/src/kernel/scheduler.c b/src/kernel/scheduler.c index afbdb85f..405f9e99 100644 --- a/src/kernel/scheduler.c +++ b/src/kernel/scheduler.c @@ -8,6 +8,33 @@ */ #include "process.h" + +/* ---- CLONE_VM address-space refcount table ---- */ +#define AS_REFCOUNT_MAX 64 +static struct { uintptr_t as; uint32_t refcnt; } g_as_refcnt[AS_REFCOUNT_MAX]; + +static void as_refcount_inc(uintptr_t as) { + if (!as) return; + for (int i = 0; i < AS_REFCOUNT_MAX; i++) { + if (g_as_refcnt[i].as == as) { g_as_refcnt[i].refcnt++; return; } + } + for (int i = 0; i < AS_REFCOUNT_MAX; i++) { + if (g_as_refcnt[i].as == 0) { g_as_refcnt[i].as = as; g_as_refcnt[i].refcnt = 1; return; } + } +} + +static uint32_t as_refcount_dec(uintptr_t as) { + if (!as) return 0; + for (int i = 0; i < AS_REFCOUNT_MAX; i++) { + if (g_as_refcnt[i].as == as) { + if (g_as_refcnt[i].refcnt > 0) g_as_refcnt[i].refcnt--; + uint32_t r = g_as_refcnt[i].refcnt; + if (r == 0) { g_as_refcnt[i].as = 0; } + return r; + } + } + return 0; +} #include "pmm.h" #include "vmm.h" #include "heap.h" @@ -336,8 +363,18 @@ static void process_reap_locked(struct process* p) { } if (p->addr_space && p->addr_space != kernel_as) { - /* Threads share addr_space with group leader; don't destroy it */ - if (!(p->flags & PROCESS_FLAG_THREAD)) { + if (p->flags & PROCESS_FLAG_THREAD) { + /* Thread: decrement the AS refcount; destroy if last ref */ + if (as_refcount_dec(p->addr_space) == 0) { + vmm_as_destroy(p->addr_space); + } + } else if (p->as_refcount > 0) { + /* Leader with live threads: decrement refcount; last one destroys */ + if (as_refcount_dec(p->addr_space) == 0) { + vmm_as_destroy(p->addr_space); + } + } else { + /* Regular process (no CLONE_VM threads): safe to destroy */ vmm_as_destroy(p->addr_space); } p->addr_space = 0; @@ -785,6 +822,12 @@ struct process* process_clone_create(uint32_t clone_flags, if (clone_flags & CLONE_VM) { proc->addr_space = current_process->addr_space; proc->flags |= PROCESS_FLAG_THREAD; + /* If this is the first thread, add the parent's own ref to the table */ + if (current_process->as_refcount == 0) { + as_refcount_inc(proc->addr_space); /* parent's ref */ + } + as_refcount_inc(proc->addr_space); /* child's ref */ + current_process->as_refcount++; } else { proc->addr_space = vmm_as_clone_user_cow(current_process->addr_space); if (!proc->addr_space) { diff --git a/src/kernel/syscall.c b/src/kernel/syscall.c index cf9af052..51a6f5f3 100644 --- a/src/kernel/syscall.c +++ b/src/kernel/syscall.c @@ -2387,6 +2387,12 @@ static int syscall_open_impl(const char* user_path, uint32_t flags) { if (rc < 0) return rc; } else if (!node) { return -ENOENT; + } else if ((flags & 0x40U) != 0U && (flags & 0x80U) != 0U) { + /* O_CREAT | O_EXCL on existing file → EEXIST */ + return -EEXIST; + } else if ((flags & 0x10000U) != 0U && !(node->flags & FS_DIRECTORY)) { + /* O_DIRECTORY on non-directory → ENOTDIR */ + return -ENOTDIR; } else if ((flags & 0x200U) != 0U && node->flags == FS_FILE) { /* O_TRUNC on existing file */ if (node->i_ops && node->i_ops->truncate) { @@ -2684,7 +2690,8 @@ static int syscall_getcwd_impl(char* user_buf, uint32_t size) { return 0; } -static int syscall_mkdir_impl(const char* user_path) { +static int syscall_mkdir_impl(const char* user_path, uint32_t mode) { + (void)mode; /* TODO: pass mode to vfs_mkdir once FS backends support it */ if (!user_path) return -EFAULT; char path[128]; @@ -3725,7 +3732,8 @@ void syscall_handler(struct registers* regs) { if (syscall_no == SYSCALL_MKDIR) { const char* path = (const char*)sc_arg0(regs); - sc_ret(regs) = (uint32_t)syscall_mkdir_impl(path); + uint32_t mode = sc_arg1(regs); + sc_ret(regs) = (uint32_t)syscall_mkdir_impl(path, mode); return; } @@ -4342,6 +4350,7 @@ static void posix_ext_syscall_dispatch(struct registers* regs, uint32_t syscall_ uint32_t length = sc_arg1(regs); struct file* f = fd_get(fd); if (!f || !f->node) { sc_ret(regs) = (uint32_t)-EBADF; return; } + if ((f->flags & 3U) == 0U) { sc_ret(regs) = (uint32_t)-EBADF; return; } /* O_RDONLY */ if (!(f->node->flags & FS_FILE)) { sc_ret(regs) = (uint32_t)-EINVAL; return; } f->node->length = length; sc_ret(regs) = 0; @@ -4358,6 +4367,7 @@ static void posix_ext_syscall_dispatch(struct registers* regs, uint32_t syscall_ fs_node_t* node = vfs_lookup(path); if (!node) { sc_ret(regs) = (uint32_t)-ENOENT; return; } if (!(node->flags & FS_FILE)) { sc_ret(regs) = (uint32_t)-EISDIR; return; } + if (vfs_check_permission(node, 2) != 0) { sc_ret(regs) = (uint32_t)-EACCES; return; } /* write */ node->length = length; sc_ret(regs) = 0; return; diff --git a/user/ulibc/src/spawn.c b/user/ulibc/src/spawn.c index 3dc98700..a39a57af 100644 --- a/user/ulibc/src/spawn.c +++ b/user/ulibc/src/spawn.c @@ -18,8 +18,7 @@ int posix_spawn(int* pid, const char* path, char* const argv[], char* const envp[]) { (void)file_actions; (void)attrp; - (void)envp; - int ret = _syscall2(SYS_POSIX_SPAWN, (int)path, (int)argv); + int ret = _syscall4(SYS_POSIX_SPAWN, (int)pid, (int)path, (int)argv, (int)envp); if (ret < 0) { errno = -ret; return -ret; -- 2.43.0