/* Per-CPU data block — one per CPU, accessed via GS segment.
* The GS base for each CPU points to its own percpu_data instance. */
struct percpu_data {
- uint32_t cpu_index; /* 0 = BSP */
- uint32_t lapic_id;
- struct process* current_process; /* Currently running process on this CPU */
- uintptr_t kernel_stack; /* Top of this CPU's kernel stack */
- uint32_t nested_irq; /* IRQ nesting depth */
- uint32_t rq_load; /* Number of READY processes on this CPU */
- uint32_t reserved[2]; /* Padding to 32 bytes */
+ struct percpu_data* self; /* offset 0 — self-pointer for percpu_get() */
+ uint32_t cpu_index; /* offset 4, 0 = BSP; read via %gs:4 */
+ uint32_t lapic_id; /* offset 8 */
+ struct process* current_process; /* offset 12; read/written via %gs:12 */
+ uintptr_t kernel_stack; /* offset 16, top of this CPU's kernel stack */
+ uint32_t nested_irq; /* offset 20, IRQ nesting depth */
+ uint32_t rq_load; /* offset 24, number of READY processes on this CPU */
+ /* User-access fault recovery state. uaccess_try_recover() writes these
+ * while a user copy is in flight, i.e. outside the copy routine's normal
+ * control flow, so they stay volatile like the file-scope variables
+ * they replace; otherwise the compiler may cache them across the copy. */
+ volatile int uaccess_active; /* offset 28, non-zero while a user copy runs */
+ volatile int uaccess_faulted; /* offset 32, set when a fault was recovered */
+ volatile uintptr_t uaccess_recover; /* offset 36, EIP to resume at after a fault */
};
+/* The GS-relative accessors hard-code these two offsets; fail the build
+ * instead of silently reading the wrong field if the layout ever changes. */
+_Static_assert(__builtin_offsetof(struct percpu_data, cpu_index) == 4,
+ "percpu_cpu_index() reads %gs:4");
+_Static_assert(__builtin_offsetof(struct percpu_data, current_process) == 12,
+ "percpu_current()/percpu_set_current() use %gs:12");
/* Initialize per-CPU data for all CPUs. Called once from BSP after SMP init. */
/* Get current CPU index (fast path via GS). */
static inline uint32_t percpu_cpu_index(void) {
uint32_t idx;
- __asm__ volatile("mov %%gs:0, %0" : "=r"(idx));
+ __asm__ volatile("mov %%gs:4, %0" : "=r"(idx)); /* cpu_index moved to offset 4 now that `self` occupies offset 0 */
return idx;
}
/* Get current process on this CPU (fast path via GS). */
static inline struct process* percpu_current(void) {
struct process* p;
- __asm__ volatile("mov %%gs:8, %0" : "=r"(p));
+ __asm__ volatile("mov %%gs:12, %0" : "=r"(p)); /* current_process moved from offset 8 to 12 after `self` was inserted at offset 0 */
return p;
}
/* Set current process on this CPU. */
static inline void percpu_set_current(struct process* proc) {
- __asm__ volatile("mov %0, %%gs:8" : : "r"(proc) : "memory");
+ __asm__ volatile("mov %0, %%gs:12" : : "r"(proc) : "memory"); /* must use the same offset (12) as the read in percpu_current() */
}
#endif
if (ph[i].p_type != PT_LOAD) continue;
if (ph[i].p_memsz == 0) continue;
+ /* Detect 32-bit overflow in p_vaddr + base_offset */
+ if (base_offset != 0 && (uintptr_t)ph[i].p_vaddr > (UINT32_MAX - base_offset))
+ return -EINVAL;
+
uintptr_t vaddr = (uintptr_t)ph[i].p_vaddr + base_offset;
if (vaddr == 0 || vaddr >= hal_mm_kernel_virt_base()) return -EINVAL;
+ /* Detect 32-bit overflow in vaddr + p_memsz */
+ if (ph[i].p_memsz > (UINT32_MAX - (uint32_t)vaddr))
+ return -EINVAL;
+
uint32_t seg_end = (uint32_t)vaddr + ph[i].p_memsz;
if (seg_end < vaddr || seg_end >= hal_mm_kernel_virt_base()) return -EINVAL;
for (uint32_t i = 0; i < ncpus; i++) {
const struct cpu_info* ci = smp_get_cpu(i);
+ g_percpu[i].self = &g_percpu[i];
g_percpu[i].cpu_index = i;
g_percpu[i].lapic_id = ci ? ci->lapic_id : 0;
g_percpu[i].current_process = NULL;
#include "errno.h"
#include "interrupts.h"
#include "hal/mm.h"
+#include "arch/x86/percpu.h"
#include <stdint.h>
return 1;
}
-static volatile int g_uaccess_active = 0;
-static volatile int g_uaccess_faulted = 0;
-static volatile uintptr_t g_uaccess_recover_eip = 0;
-
int uaccess_try_recover(uintptr_t fault_addr, struct registers* regs) {
if (!regs) return 0;
- if (g_uaccess_active == 0) return 0;
- if (g_uaccess_recover_eip == 0) return 0;
+
+ struct percpu_data* pc = percpu_get(); /* per-CPU block; replaces the removed file-scope g_uaccess_* state */
+ if (!pc->uaccess_active) return 0; /* no user copy in progress: decline to recover */
+ if (!pc->uaccess_recover) return 0; /* no recovery address registered: decline to recover */
// Only recover faults on user addresses; kernel faults should still panic.
if (fault_addr >= hal_mm_kernel_virt_base()) return 0;
- g_uaccess_faulted = 1;
- regs->eip = (uint32_t)g_uaccess_recover_eip;
+ pc->uaccess_faulted = 1; /* record that a fault was recovered */
+ regs->eip = (uint32_t)pc->uaccess_recover; /* redirect the saved EIP to the registered recovery address */
return 1;
}
if (len == 0) return 0;
if (!user_range_ok(src_user, len)) return -EFAULT;
- g_uaccess_faulted = 0;
- g_uaccess_recover_eip = (uintptr_t)&&uaccess_fault;
- g_uaccess_active = 1;
+ struct percpu_data* pc = percpu_get();
+ pc->uaccess_faulted = 0;
+ pc->uaccess_recover = (uintptr_t)&&uaccess_fault;
+ pc->uaccess_active = 1;
stac();
uintptr_t up = (uintptr_t)src_user;
}
clac();
- g_uaccess_active = 0;
- g_uaccess_recover_eip = 0;
- if (g_uaccess_faulted) return -EFAULT;
+ pc->uaccess_active = 0;
+ pc->uaccess_recover = 0;
+ if (pc->uaccess_faulted) return -EFAULT;
return 0;
uaccess_fault:
clac();
- g_uaccess_active = 0;
- g_uaccess_faulted = 0;
- g_uaccess_recover_eip = 0;
+ /* This label is reached only because uaccess_try_recover() rewrote the
+ * saved EIP, not through any control flow the compiler can see, so the
+ * local `pc` is not guaranteed to be live in the place the code below
+ * expects. Re-fetch it instead of trusting whatever the faulting
+ * instruction left behind. */
+ pc = percpu_get();
+ pc->uaccess_active = 0;
+ pc->uaccess_faulted = 0;
+ pc->uaccess_recover = 0;
return -EFAULT;
}
if (!x86_user_range_writable_user((uintptr_t)dst_user, len)) return -EFAULT;
- g_uaccess_faulted = 0;
- g_uaccess_recover_eip = (uintptr_t)&&uaccess_fault2;
- g_uaccess_active = 1;
+ struct percpu_data* pc = percpu_get();
+ pc->uaccess_faulted = 0;
+ pc->uaccess_recover = (uintptr_t)&&uaccess_fault2;
+ pc->uaccess_active = 1;
stac();
uintptr_t up = (uintptr_t)dst_user;
}
clac();
- g_uaccess_active = 0;
- g_uaccess_recover_eip = 0;
- if (g_uaccess_faulted) return -EFAULT;
+ pc->uaccess_active = 0;
+ pc->uaccess_recover = 0;
+ if (pc->uaccess_faulted) return -EFAULT;
return 0;
uaccess_fault2:
clac();
- g_uaccess_active = 0;
- g_uaccess_faulted = 0;
- g_uaccess_recover_eip = 0;
+ /* This label is reached only because uaccess_try_recover() rewrote the
+ * saved EIP, not through any control flow the compiler can see, so the
+ * local `pc` is not guaranteed to be live in the place the code below
+ * expects. Re-fetch it instead of trusting whatever the faulting
+ * instruction left behind. */
+ pc = percpu_get();
+ pc->uaccess_active = 0;
+ pc->uaccess_faulted = 0;
+ pc->uaccess_recover = 0;
return -EFAULT;
}
/* MMIO helpers */
/* ------------------------------------------------------------------ */
+/* Compiler barrier — prevents reordering across MMIO accesses.
+ * On x86 with UC-mapped MMIO, CPU ordering is already strict;
+ * the barrier ensures the compiler respects that ordering too.
+ * It expands to an empty asm statement with a "memory" clobber, so it
+ * emits no instruction and is not a CPU fence. */
+#define mmio_barrier() __asm__ volatile("" ::: "memory")
+
static inline uint32_t e1000_read(uint32_t reg) {
- return e1000_mmio[reg / 4];
+ uint32_t val = e1000_mmio[reg / 4]; /* reg is divided by 4 to index e1000_mmio — presumably a byte offset into an array of 32-bit registers */
+ mmio_barrier(); /* stop the compiler from hoisting later memory accesses above this register read */
+ return val;
}
static inline void e1000_write(uint32_t reg, uint32_t val) {
+ /* Barrier BEFORE the store: a barrier placed only after it still lets
+ * the compiler sink earlier ordinary-memory writes (such as buffers the
+ * device is about to be told to read) below the register write. */
+ mmio_barrier();
e1000_mmio[reg / 4] = val;
+ /* Barrier AFTER the store: keeps later memory accesses from being
+ * hoisted above the register write. */
+ mmio_barrier();
}
/* ------------------------------------------------------------------ */
if (c == -1) break;
e.d_ino = (uint32_t)c;
e.d_type = (uint8_t)entries[c].flags;
- strcpy(e.d_name, entries[c].name);
+ strncpy(e.d_name, entries[c].name, sizeof(e.d_name) - 1);
+ e.d_name[sizeof(e.d_name) - 1] = '\0';
}
e.d_reclen = (uint16_t)sizeof(e);
if (di < NBUILTINS) {
e.d_ino = builtins[di].ino;
e.d_type = builtins[di].type;
- strcpy(e.d_name, builtins[di].name);
+ strncpy(e.d_name, builtins[di].name, sizeof(e.d_name) - 1);
+ e.d_name[sizeof(e.d_name) - 1] = '\0';
} else {
fs_node_t* rn = g_registered[di - NBUILTINS];
e.d_ino = rn->inode;
e.d_type = (uint8_t)rn->flags;
- strcpy(e.d_name, rn->name);
+ strncpy(e.d_name, rn->name, sizeof(e.d_name) - 1);
+ e.d_name[sizeof(e.d_name) - 1] = '\0';
}
}
if (!c) return NULL;
memset(c, 0, sizeof(*c));
- strcpy(c->vfs.name, name);
+ strncpy(c->vfs.name, name, sizeof(c->vfs.name) - 1);
+ c->vfs.name[sizeof(c->vfs.name) - 1] = '\0';
c->ofs = parent->ofs;
c->lower = lower_child;
c->upper = upper_child;
static uint32_t kstack_next_slot = 0;
static spinlock_t kstack_lock = {0};
+/* Free-slot recycling stack: freed slot indices are pushed here and
+ * reused before bumping kstack_next_slot. */
+#define KSTACK_FREE_MAX 256
+static uint32_t kstack_free_stack[KSTACK_FREE_MAX];
+static uint32_t kstack_free_top = 0;
+
static void* kstack_alloc(void) {
uintptr_t flags = spin_lock_irqsave(&kstack_lock);
- if (kstack_next_slot >= KSTACK_MAX) {
+
+ uint32_t slot;
+ if (kstack_free_top > 0) {
+ slot = kstack_free_stack[--kstack_free_top];
+ } else if (kstack_next_slot < KSTACK_MAX) {
+ slot = kstack_next_slot++;
+ } else {
spin_unlock_irqrestore(&kstack_lock, flags);
+ kprintf("[SCHED] BUG: kernel stack slots exhausted!\n");
return NULL;
}
- uint32_t slot = kstack_next_slot++;
+
spin_unlock_irqrestore(&kstack_lock, flags);
uintptr_t base = KSTACK_REGION + slot * KSTACK_SLOT;
return;
for (uint32_t i = 0; i < KSTACK_PAGES; i++)
vmm_unmap_page((uint64_t)(addr + i * 0x1000U));
- /* Note: slot is not recycled — acceptable for now */
+
+ /* Recycle the slot index */
+ uint32_t slot = (uint32_t)((addr - 0x1000U - KSTACK_REGION) / KSTACK_SLOT);
+ uintptr_t flags = spin_lock_irqsave(&kstack_lock);
+ if (kstack_free_top < KSTACK_FREE_MAX) {
+ kstack_free_stack[kstack_free_top++] = slot;
+ }
+ spin_unlock_irqrestore(&kstack_lock, flags);
}
/* ---------- O(1) runqueue ---------- */
} else {
// Nothing in runqueues.
if (prev->state == PROCESS_READY) {
- // prev was just enqueued to expired — pull it back.
- rq_dequeue(crq->expired, prev);
+ // prev was just enqueued before rq_pick_next swapped active/expired.
+ // After the swap, prev is in crq->active (the old expired).
+ rq_dequeue(crq->active, prev);
next = prev;
} else {
// Fall back to this CPU's idle process.
}
void ksem_wait(ksem_t* s) {
- (void)ksem_wait_timeout(s, 0);
+ /* The loop below retries on every non-zero result. A NULL semaphore can
+ * never be acquired, so bail out rather than retry forever if
+ * ksem_wait_timeout() rejects it with an error. */
+ if (!s) return;
+ while (ksem_wait_timeout(s, 0) != 0) {
+ /* Waiters array full — yield and retry */
+ schedule();
+ }
}
int ksem_wait_timeout(ksem_t* s, uint32_t timeout_ms) {
if (mqd < 0 || mqd >= MQ_MAX_QUEUES) return -EBADF;
if (len > MQ_MSG_SIZE) return -EMSGSIZE;
+ /* Copy user data into a kernel-side buffer before taking the lock
+ * to avoid TOCTOU: another thread could race on the same slot. */
+ uint8_t kbuf[MQ_MSG_SIZE];
+ if (copy_from_user(kbuf, user_buf, len) < 0) return -EFAULT;
+
uintptr_t fl = spin_lock_irqsave(&mq_lock);
struct mq_queue* q = &mq_table[mqd];
if (!q->active) { spin_unlock_irqrestore(&mq_lock, fl); return -EBADF; }
if (q->count >= q->maxmsg) { spin_unlock_irqrestore(&mq_lock, fl); return -EAGAIN; }
struct mq_msg* m = &q->msgs[q->tail];
- spin_unlock_irqrestore(&mq_lock, fl);
-
- if (copy_from_user(m->data, user_buf, len) < 0) return -EFAULT;
+ memcpy(m->data, kbuf, len);
m->len = len;
m->prio = prio;
-
- fl = spin_lock_irqsave(&mq_lock);
q->tail = (q->tail + 1) % q->maxmsg;
q->count++;
spin_unlock_irqrestore(&mq_lock, fl);
if (p_type != 1) continue; /* PT_LOAD = 1 */
if (p_memsz == 0) continue;
+ /* Detect 32-bit overflow in p_vaddr + base */
+ if (p_vaddr > (UINT32_MAX - base)) continue;
uint32_t vaddr = p_vaddr + base;
if (vaddr >= 0xC0000000U) continue;
+ /* Detect 32-bit overflow in vaddr + p_memsz */
+ if (p_memsz > (UINT32_MAX - vaddr)) continue;
+
/* Map pages */
uint32_t start_page = vaddr & ~0xFFFU;
uint32_t end_page = (vaddr + p_memsz - 1) & ~0xFFFU;
memset(n, 0, sizeof(*n));
if (name) {
- strcpy(n->vfs.name, name);
+ strncpy(n->vfs.name, name, sizeof(n->vfs.name) - 1);
+ n->vfs.name[sizeof(n->vfs.name) - 1] = '\0';
} else {
n->vfs.name[0] = 0;
}
if (!c) break;
e.d_ino = c->vfs.inode;
e.d_type = (uint8_t)c->vfs.flags;
- strcpy(e.d_name, c->vfs.name);
+ strncpy(e.d_name, c->vfs.name, sizeof(e.d_name) - 1);
+ e.d_name[sizeof(e.d_name) - 1] = '\0';
}
e.d_reclen = (uint16_t)sizeof(e);
struct tmpfs_node* ln = tmpfs_node_alloc(leaf, FS_SYMLINK);
if (!ln) return -ENOMEM;
- strcpy(ln->vfs.symlink_target, target);
+ strncpy(ln->vfs.symlink_target, target, sizeof(ln->vfs.symlink_target) - 1);
+ ln->vfs.symlink_target[sizeof(ln->vfs.symlink_target) - 1] = '\0';
- ln->vfs.length = (uint32_t)strlen(target);
+ /* The copy above may truncate, so measure the stored string. Using
+ * strlen(target) would report a length larger than the buffer holds. */
+ ln->vfs.length = (uint32_t)strlen(ln->vfs.symlink_target);
/* symlinks have no f_ops */
return n;
}
-/* Buddy address via XOR on the offset from heap start */
+/* Buddy address via XOR on the offset from heap start.
+ * Returns NULL if `order` is not a valid shift count or if the result
+ * falls outside the heap — defence-in-depth against corrupted order
+ * fields. Callers must check for NULL before dereferencing. */
static inline block_hdr_t* buddy_of(block_hdr_t* b, int order) {
+ /* Shifting by a negative count or by >= the width of the operand is
+ * undefined behaviour, and a corrupted order field is exactly where
+ * that happens. Reject it before the shift; the range check below
+ * cannot catch it because the shift result is then meaningless. */
+ if (order < 0 || (unsigned)order >= 8U * sizeof(unsigned))
+ return NULL;
uintptr_t off = (uintptr_t)b - KHEAP_START;
- return (block_hdr_t*)(KHEAP_START + (off ^ (1U << order)));
+ uintptr_t buddy_off = off ^ (1U << order);
+ if (buddy_off >= BUDDY_HEAP_SIZE)
+ return NULL;
+ return (block_hdr_t*)(KHEAP_START + buddy_off);
}
/* Minimum order that can hold `size` user bytes (+ header) */
/* Coalesce with buddy while possible */
while (order < BUDDY_MAX_ORDER) {
block_hdr_t* buddy = buddy_of(blk, order);
+ if (!buddy) break;
/* Buddy must be valid, free, and at the same order */
if (buddy->magic != BUDDY_MAGIC || !buddy->is_free ||
uintptr_t flags = spin_lock_irqsave(&pmm_lock);
uint16_t rc = frame_refcount[frame];
+
+ if (rc == 0 || !bitmap_test(frame)) {
+ spin_unlock_irqrestore(&pmm_lock, flags);
+ kprintf("[PMM] BUG: double free of frame %u (rc=%u)\n",
+ (unsigned)frame, (unsigned)rc);
+ return;
+ }
+
if (rc > 1) {
frame_refcount[frame]--;
spin_unlock_irqrestore(&pmm_lock, flags);