#include "process.h"
#include "pmm.h"
#include "vmm.h"
#include "heap.h"
#include "console.h"
#include "timer.h" // Need access to current tick usually, but we pass it in wake_check
#include "spinlock.h"
#include "utils.h"
#include "errno.h"
#include "hal/cpu.h"
#include "hal/usermode.h"
#include "arch_process.h"
#include "arch_fpu.h"
#include "sched_pcpu.h"
#include <stddef.h>
#ifndef __i386__
/* On non-x86 builds current_process is a plain global; on i386 it is kept in
 * per-CPU storage instead (see percpu_set_current / percpu_cpu_index). */
struct process* current_process = NULL;
#endif
/* Circular doubly-linked list of ALL processes (despite the name, not only
 * runnable ones) — fork/clone/create insert here, reap removes. Guarded by
 * sched_lock. */
struct process* ready_queue_head = NULL;
struct process* ready_queue_tail = NULL;
static uint32_t next_pid = 2; /* PID 1 reserved for init */
static uint32_t init_pid = 0; /* PID of the first userspace process ("init") */
spinlock_t sched_lock = {0};
/* Kernel address space handle, captured in process_init(); shared by kernel
 * threads and per-CPU idle tasks. */
static uintptr_t kernel_as = 0;
/*
* Kernel stack allocator with guard pages.
* Layout per slot: [guard page (unmapped)] [2 stack pages (mapped)]
* Virtual region: 0xC8000000 .. 0xCFFFFFFF (128MB, up to 10922 stacks)
*/
#define KSTACK_REGION 0xC8000000U
#define KSTACK_PAGES 2 /* 8KB usable stack per thread */
#define KSTACK_SIZE (KSTACK_PAGES * 0x1000U)
#define KSTACK_SLOT (0x1000U + KSTACK_SIZE) /* guard + stack */
#define KSTACK_MAX 10922
/* Bump-allocator cursor: index of the next unused slot. Slots are never
 * recycled (see kstack_free), so this only moves forward. */
static uint32_t kstack_next_slot = 0;
static spinlock_t kstack_lock = {0}; /* protects kstack_next_slot only */
/* Allocate one guard-paged kernel stack slot and map its backing pages.
 * Returns the virtual address of the lowest usable stack byte, or NULL on
 * slot exhaustion / OOM. The guard page at the slot base stays unmapped so
 * a stack overflow faults instead of silently corrupting the next slot. */
static void* kstack_alloc(void) {
    uintptr_t flags = spin_lock_irqsave(&kstack_lock);
    if (kstack_next_slot >= KSTACK_MAX) {
        spin_unlock_irqrestore(&kstack_lock, flags);
        return NULL;
    }
    uint32_t slot = kstack_next_slot++;
    spin_unlock_irqrestore(&kstack_lock, flags);
    uintptr_t base = KSTACK_REGION + slot * KSTACK_SLOT;
    /* base+0x0000 = guard page (leave unmapped) */
    /* base+0x1000 .. base+0x1000+KSTACK_SIZE = actual stack pages */
    for (uint32_t i = 0; i < KSTACK_PAGES; i++) {
        void* phys = pmm_alloc_page();
        if (!phys) {
            /* FIX: OOM mid-way previously returned NULL while leaving the
             * pages already mapped for this slot dangling in the kernel
             * region. Unmap them before bailing out.
             * NOTE(review): assumes vmm_unmap_page releases the physical
             * frame as well; if it does not, the frames still leak here —
             * confirm against the vmm implementation. */
            for (uint32_t j = 0; j < i; j++)
                vmm_unmap_page((uint64_t)(base + 0x1000U + j * 0x1000U));
            return NULL;
        }
        vmm_map_page((uint64_t)(uintptr_t)phys,
                     (uint64_t)(base + 0x1000U + i * 0x1000U),
                     VMM_FLAG_PRESENT | VMM_FLAG_RW);
    }
    memset((void*)(base + 0x1000U), 0, KSTACK_SIZE);
    return (void*)(base + 0x1000U);
}
static void kstack_free(void* stack) {
if (!stack) return;
uintptr_t addr = (uintptr_t)stack;
if (addr < KSTACK_REGION || addr >= KSTACK_REGION + KSTACK_MAX * KSTACK_SLOT)
return;
for (uint32_t i = 0; i < KSTACK_PAGES; i++)
vmm_unmap_page((uint64_t)(addr + i * 0x1000U));
/* Note: slot is not recycled — acceptable for now */
}
/* ---------- O(1) runqueue ---------- */
/* One FIFO of READY processes per priority level. */
struct prio_queue {
struct process* head;
struct process* tail;
};
struct runqueue {
uint32_t bitmap; // bit i set => queue[i] non-empty
struct prio_queue queue[SCHED_NUM_PRIOS];
};
/* Per-CPU runqueue: each CPU has its own active/expired pair + idle process */
struct cpu_rq {
struct runqueue stores[2];
struct runqueue *active;
struct runqueue *expired;
struct process *idle; /* per-CPU idle (PID 0 on BSP) */
};
#ifdef __i386__
#include "arch/x86/smp.h"
#define SCHED_MAX_CPUS SMP_MAX_CPUS
#else
#define SCHED_MAX_CPUS 1
#endif
/* All runqueue state below is guarded by sched_lock (single global lock). */
static struct cpu_rq pcpu_rq[SCHED_MAX_CPUS];
#ifdef __i386__
#include "arch/x86/lapic.h"
#include "arch/x86/percpu.h"
/* Send IPI reschedule to wake a remote CPU when work arrives */
/* No-op when targeting the calling CPU, when the local APIC is not yet
 * enabled, or when the target CPU is unknown to the SMP layer. */
static void sched_ipi_resched(uint32_t target_cpu) {
uint32_t my_cpu = percpu_cpu_index();
if (target_cpu == my_cpu) return;
if (!lapic_is_enabled()) return;
extern const struct cpu_info* smp_get_cpu(uint32_t index);
const struct cpu_info* ci = smp_get_cpu(target_cpu);
if (!ci) return;
lapic_send_ipi(ci->lapic_id, IPI_RESCHED_VEC);
}
#else
/* Uniprocessor build: rescheduling IPIs have no meaning. */
static void sched_ipi_resched(uint32_t target_cpu) { (void)target_cpu; }
#endif
/* Bit-scan-forward: index of the least-significant set bit.
 * Precondition: v != 0 — __builtin_ctz(0) is undefined behavior. All
 * callers test the runqueue bitmap for non-zero before calling. */
static inline uint32_t bsf32(uint32_t v) {
return (uint32_t)__builtin_ctz(v);
}
/* Append p to the tail of its priority-level FIFO and mark that level as
 * populated in the bitmap. Caller holds sched_lock. */
static void rq_enqueue(struct runqueue* rq, struct process* p) {
    struct prio_queue* q = &rq->queue[p->priority];
    struct process* old_tail = q->tail;
    p->rq_next = NULL;
    p->rq_prev = old_tail;
    if (old_tail == NULL) {
        q->head = p;
    } else {
        old_tail->rq_next = p;
    }
    q->tail = p;
    rq->bitmap |= (1U << p->priority);
}
/* Unlink p from its priority-level FIFO; clear the bitmap bit when the
 * level becomes empty. Caller holds sched_lock and guarantees p is queued
 * in rq. */
static void rq_dequeue(struct runqueue* rq, struct process* p) {
    struct prio_queue* q = &rq->queue[p->priority];
    struct process* before = p->rq_prev;
    struct process* after = p->rq_next;
    if (before != NULL)
        before->rq_next = after;
    else
        q->head = after;
    if (after != NULL)
        after->rq_prev = before;
    else
        q->tail = before;
    p->rq_next = NULL;
    p->rq_prev = NULL;
    if (q->head == NULL)
        rq->bitmap &= ~(1U << p->priority);
}
static void rq_remove_if_queued(struct process* p) {
uint8_t prio = p->priority;
struct process* it;
struct cpu_rq *crq = &pcpu_rq[p->cpu_id < SCHED_MAX_CPUS ? p->cpu_id : 0];
it = crq->active->queue[prio].head;
while (it) {
if (it == p) { rq_dequeue(crq->active, p); return; }
it = it->rq_next;
}
it = crq->expired->queue[prio].head;
while (it) {
if (it == p) { rq_dequeue(crq->expired, p); return; }
it = it->rq_next;
}
}
/* ---------- Sorted sleep queue (by wake_at_tick) ---------- */
static struct process* sleep_head = NULL;
/* Insert p into the sleep list, kept sorted ascending by wake_at_tick.
 * Caller holds sched_lock. p must not already be in the list. */
static void sleep_queue_insert(struct process* p) {
    p->in_sleep_queue = 1;
    if (sleep_head == NULL || p->wake_at_tick <= sleep_head->wake_at_tick) {
        /* New head of the list. */
        p->sleep_prev = NULL;
        p->sleep_next = sleep_head;
        if (sleep_head != NULL)
            sleep_head->sleep_prev = p;
        sleep_head = p;
    } else {
        /* Walk to the last node with a strictly smaller deadline. */
        struct process* pos = sleep_head;
        while (pos->sleep_next != NULL &&
               pos->sleep_next->wake_at_tick < p->wake_at_tick)
            pos = pos->sleep_next;
        p->sleep_next = pos->sleep_next;
        p->sleep_prev = pos;
        if (pos->sleep_next != NULL)
            pos->sleep_next->sleep_prev = p;
        pos->sleep_next = p;
    }
}
/* Unlink p from the sleep list; no-op if p is not queued.
 * Caller holds sched_lock. */
static void sleep_queue_remove(struct process* p) {
    if (!p->in_sleep_queue)
        return;
    struct process* prv = p->sleep_prev;
    struct process* nxt = p->sleep_next;
    if (prv != NULL)
        prv->sleep_next = nxt;
    else
        sleep_head = nxt;
    if (nxt != NULL)
        nxt->sleep_prev = prv;
    p->sleep_prev = NULL;
    p->sleep_next = NULL;
    p->in_sleep_queue = 0;
}
/* ---------- Sorted alarm queue (by alarm_tick) ---------- */
static struct process* alarm_head = NULL;
/* Insert p into the alarm list, kept sorted ascending by alarm_tick.
 * Mirrors sleep_queue_insert. Caller holds sched_lock. */
static void alarm_queue_insert(struct process* p) {
    p->in_alarm_queue = 1;
    if (alarm_head == NULL || p->alarm_tick <= alarm_head->alarm_tick) {
        /* New head of the list. */
        p->alarm_prev = NULL;
        p->alarm_next = alarm_head;
        if (alarm_head != NULL)
            alarm_head->alarm_prev = p;
        alarm_head = p;
    } else {
        /* Walk to the last node with a strictly smaller deadline. */
        struct process* pos = alarm_head;
        while (pos->alarm_next != NULL &&
               pos->alarm_next->alarm_tick < p->alarm_tick)
            pos = pos->alarm_next;
        p->alarm_next = pos->alarm_next;
        p->alarm_prev = pos;
        if (pos->alarm_next != NULL)
            pos->alarm_next->alarm_prev = p;
        pos->alarm_next = p;
    }
}
/* Unlink p from the alarm list; no-op if p is not queued.
 * Caller holds sched_lock. */
static void alarm_queue_remove(struct process* p) {
    if (!p->in_alarm_queue)
        return;
    struct process* prv = p->alarm_prev;
    struct process* nxt = p->alarm_next;
    if (prv != NULL)
        prv->alarm_next = nxt;
    else
        alarm_head = nxt;
    if (nxt != NULL)
        nxt->alarm_prev = prv;
    p->alarm_prev = NULL;
    p->alarm_next = NULL;
    p->in_alarm_queue = 0;
}
/* O(1) pick: return the head of the highest-priority (lowest-index)
 * non-empty level on this CPU. If the active runqueue is drained, swap
 * active <-> expired once and retry; return NULL when both are empty
 * (only the idle task remains). Caller holds sched_lock. */
static struct process* rq_pick_next(uint32_t cpu) {
    struct cpu_rq* crq = &pcpu_rq[cpu];
    if (crq->active->bitmap == 0) {
        /* Active drained: promote the expired array (epoch swap). */
        struct runqueue* t = crq->active;
        crq->active = crq->expired;
        crq->expired = t;
    }
    uint32_t bits = crq->active->bitmap;
    if (bits == 0)
        return NULL;
    return crq->active->queue[bsf32(bits)].head;
}
void sched_enqueue_ready(struct process* p) {
if (!p) return;
uint32_t target_cpu = 0;
int need_ipi = 0;
uintptr_t flags = spin_lock_irqsave(&sched_lock);
sleep_queue_remove(p);
if (p->state == PROCESS_READY) {
target_cpu = p->cpu_id < SCHED_MAX_CPUS ? p->cpu_id : 0;
rq_enqueue(pcpu_rq[target_cpu].active, p);
sched_pcpu_inc_load(target_cpu);
need_ipi = 1;
}
spin_unlock_irqrestore(&sched_lock, flags);
if (need_ipi) sched_ipi_resched(target_cpu);
}
void thread_wrapper(void (*fn)(void));
/* Find a process by PID in the circular process list, or NULL.
 * Caller must hold sched_lock. */
static struct process* process_find_locked(uint32_t pid) {
    struct process* it = ready_queue_head;
    if (it == NULL)
        return NULL;
    const struct process* const first = it;
    for (;;) {
        if (it->pid == pid)
            return it;
        it = it->next;
        if (it == NULL || it == first)
            return NULL;
    }
}
/* Unlink a terminated process from the circular process list and release
 * its kernel stack, address space, and struct. Caller holds sched_lock.
 * PID 0 (idle) is never reaped; the sole remaining list element is also
 * never removed (the circular-list invariant requires head/tail to exist).
 * The kstack virtual slot is intentionally not recycled (see kstack_free). */
static void process_reap_locked(struct process* p) {
if (!p) return;
if (p->pid == 0) return;
/* Safety net: ensure process is not in any runqueue/sleep/alarm queue before freeing */
rq_remove_if_queued(p);
sleep_queue_remove(p);
alarm_queue_remove(p);
/* p is the only element in the list — cannot unlink it without breaking
 * head/tail; leave it in place (its memory is not freed in this case). */
if (p == ready_queue_head && p == ready_queue_tail) {
return;
}
/* Unlink from the circular doubly-linked list. In a circular list next/prev
 * are never NULL, so both fixups below always run. */
if (p->next) {
p->next->prev = p->prev;
}
if (p->prev) {
p->prev->next = p->next;
}
if (p == ready_queue_head) {
ready_queue_head = p->next;
}
if (p == ready_queue_tail) {
ready_queue_tail = p->prev;
}
if (p->kernel_stack) {
kstack_free(p->kernel_stack);
p->kernel_stack = NULL;
}
if (p->addr_space && p->addr_space != kernel_as) {
/* Threads share addr_space with group leader; don't destroy it */
if (!(p->flags & PROCESS_FLAG_THREAD)) {
vmm_as_destroy(p->addr_space);
}
p->addr_space = 0;
}
kfree(p);
}
/* Drop every open file descriptor of p. Each slot is cleared first; the
 * final reference (atomic decrement reaching zero) closes the underlying
 * VFS node and frees the struct file. */
static void process_close_all_files_locked(struct process* p) {
    if (p == NULL)
        return;
    for (int fd = 0; fd < PROCESS_MAX_FILES; fd++) {
        struct file* open_file = p->files[fd];
        if (open_file == NULL)
            continue;
        p->files[fd] = NULL;
        int remaining = __sync_sub_and_fetch(&open_file->refcount, 1);
        if (remaining == 0) {
            if (open_file->node)
                vfs_close(open_file->node);
            kfree(open_file);
        }
    }
}
/* Deliver signal `sig` to process `pid`.
 * sig == 9 (SIGKILL) terminates immediately; any other signal is made
 * pending, and a blocked/sleeping target is woken to handle it.
 * Returns 0 on success, -EINVAL for bad arguments, -ESRCH if no such
 * process. Killing yourself with SIGKILL never returns. */
int process_kill(uint32_t pid, int sig) {
    const int SIG_KILL = 9;
    if (pid == 0) return -EINVAL;
    if (sig <= 0 || sig >= PROCESS_MAX_SIG) return -EINVAL;
    if (current_process && current_process->pid == pid && sig == SIG_KILL) {
        /* Suicide path: close FDs, become a zombie, reschedule forever. */
        process_close_all_files_locked(current_process);
        process_exit_notify(128 + sig);
        hal_cpu_enable_interrupts();
        schedule();
        for (;;) hal_cpu_idle();
    }
    uint32_t wake_cpu = (uint32_t)-1; /* CPU to IPI once the lock is dropped */
    uintptr_t flags = spin_lock_irqsave(&sched_lock);
    struct process* p = process_find_locked(pid);
    if (!p || p->pid == 0) {
        spin_unlock_irqrestore(&sched_lock, flags);
        return -ESRCH;
    }
    if (p->state == PROCESS_ZOMBIE) {
        spin_unlock_irqrestore(&sched_lock, flags);
        return 0;
    }
    if (sig == SIG_KILL) {
        /* Remove from runqueue/sleep queue BEFORE marking ZOMBIE.
         * NOTE(review): the per-CPU load counter is not decremented when p
         * is pulled from a runqueue here — confirm whether load can drift. */
        if (p->state == PROCESS_READY) {
            rq_remove_if_queued(p);
        }
        sleep_queue_remove(p);
        alarm_queue_remove(p);
        process_close_all_files_locked(p);
        p->exit_status = 128 + sig;
        p->state = PROCESS_ZOMBIE;
        /* Wake a parent blocked in waitpid() for this child.
         * (p->pid != 0 is guaranteed by the -ESRCH check above.) */
        struct process* parent = process_find_locked(p->parent_pid);
        if (parent && parent->state == PROCESS_BLOCKED && parent->waiting) {
            if (parent->wait_pid == -1 || parent->wait_pid == (int)p->pid) {
                parent->wait_result_pid = (int)p->pid;
                parent->wait_result_status = p->exit_status;
                parent->state = PROCESS_READY;
                /* FIX: clamp cpu_id, account load, and send an IPI — this
                 * path previously used cpu_id unclamped and skipped both,
                 * unlike sched_enqueue_ready()/process_exit_notify(). */
                uint32_t pcpu = parent->cpu_id < SCHED_MAX_CPUS ? parent->cpu_id : 0;
                rq_enqueue(pcpu_rq[pcpu].active, parent);
                sched_pcpu_inc_load(pcpu);
                wake_cpu = pcpu;
            }
        }
    } else {
        p->sig_pending_mask |= (1U << (uint32_t)sig);
        if (p->state == PROCESS_BLOCKED || p->state == PROCESS_SLEEPING) {
            sleep_queue_remove(p);
            p->state = PROCESS_READY;
            /* FIX: same clamp + load accounting + IPI as above. */
            uint32_t tcpu = p->cpu_id < SCHED_MAX_CPUS ? p->cpu_id : 0;
            rq_enqueue(pcpu_rq[tcpu].active, p);
            sched_pcpu_inc_load(tcpu);
            wake_cpu = tcpu;
        }
    }
    spin_unlock_irqrestore(&sched_lock, flags);
    if (wake_cpu != (uint32_t)-1) sched_ipi_resched(wake_cpu);
    return 0;
}
/* Deliver `sig` to every live process in process group `pgrp` (except
 * PID 0). Blocked/sleeping members are woken so they can handle it.
 * Returns 0 if at least one member was signalled, else -ESRCH. */
int process_kill_pgrp(uint32_t pgrp, int sig) {
    if (pgrp == 0) return -EINVAL;
    if (sig <= 0 || sig >= PROCESS_MAX_SIG) return -EINVAL;
    uint32_t ipi_mask = 0; /* CPUs with newly-runnable work */
    uintptr_t flags = spin_lock_irqsave(&sched_lock);
    int found = 0;
    struct process* it = ready_queue_head;
    if (it) {
        const struct process* const start = it;
        do {
            if (it->pgrp_id == pgrp && it->pid != 0 && it->state != PROCESS_ZOMBIE) {
                it->sig_pending_mask |= (1U << (uint32_t)sig);
                if (it->state == PROCESS_BLOCKED || it->state == PROCESS_SLEEPING) {
                    sleep_queue_remove(it);
                    it->state = PROCESS_READY;
                    /* FIX: clamp cpu_id and account load (was unclamped with
                     * no sched_pcpu_inc_load), and remember the CPU so we
                     * can IPI it after dropping the lock — matching
                     * sched_enqueue_ready() and process_wake_check(). */
                    uint32_t tcpu = it->cpu_id < SCHED_MAX_CPUS ? it->cpu_id : 0;
                    rq_enqueue(pcpu_rq[tcpu].active, it);
                    sched_pcpu_inc_load(tcpu);
                    if (tcpu < 32) ipi_mask |= (1U << tcpu);
                }
                found = 1;
            }
            it = it->next;
        } while (it && it != start);
    }
    spin_unlock_irqrestore(&sched_lock, flags);
    for (uint32_t c = 0; ipi_mask != 0 && c < SCHED_MAX_CPUS && c < 32; c++) {
        if (ipi_mask & (1U << c)) sched_ipi_resched(c);
    }
    return found ? 0 : -ESRCH;
}
/* POSIX-style waitpid: reap a zombie child (pid == -1 means "any child").
 * Returns the reaped child's PID (status in *status_out), 0 with WNOHANG
 * when children exist but none is a zombie, or -ECHILD when the caller
 * has no children. Blocks otherwise until a child exits.
 * NOTE(review): wait_result_pid is read after schedule() without holding
 * sched_lock — presumably safe because only the waker writes it while we
 * are BLOCKED, but worth confirming on SMP. */
int process_waitpid(int pid, int* status_out, uint32_t options) {
if (!current_process) return -ECHILD;
const uint32_t WNOHANG = 1U;
while (1) {
uintptr_t flags = spin_lock_irqsave(&sched_lock);
/* Scan the circular process list for children of the caller. */
struct process* it = ready_queue_head;
struct process* start = it;
int found_child = 0;
if (it) {
do {
if (it->parent_pid == current_process->pid) {
found_child = 1;
if (pid == -1 || (int)it->pid == pid) {
if (it->state == PROCESS_ZOMBIE) {
/* Zombie matched: copy out results, free it, return. */
int retpid = (int)it->pid;
int st = it->exit_status;
process_reap_locked(it);
spin_unlock_irqrestore(&sched_lock, flags);
if (status_out) *status_out = st;
return retpid;
}
}
}
it = it->next;
} while (it && it != start);
}
if (!found_child) {
spin_unlock_irqrestore(&sched_lock, flags);
return -ECHILD;
}
if ((options & WNOHANG) != 0) {
spin_unlock_irqrestore(&sched_lock, flags);
return 0;
}
/* Block until a child exit wakes us (see process_exit_notify). */
current_process->waiting = 1;
current_process->wait_pid = pid;
current_process->wait_result_pid = -1;
current_process->state = PROCESS_BLOCKED;
spin_unlock_irqrestore(&sched_lock, flags);
hal_cpu_enable_interrupts();
schedule();
/* Woken: the exiting child stored its pid/status in our wait_result_*
 * fields. Re-validate under the lock before reaping — the child may
 * already have been reaped by another path. */
if (current_process->wait_result_pid != -1) {
int rp = current_process->wait_result_pid;
int st = current_process->wait_result_status;
uintptr_t flags2 = spin_lock_irqsave(&sched_lock);
struct process* child = process_find_locked((uint32_t)rp);
if (child && child->parent_pid == current_process->pid && child->state == PROCESS_ZOMBIE) {
process_reap_locked(child);
}
spin_unlock_irqrestore(&sched_lock, flags2);
current_process->waiting = 0;
current_process->wait_pid = -1;
current_process->wait_result_pid = -1;
if (status_out) *status_out = st;
return rp;
}
}
}
/* Record the PID of the init process used for orphan reparenting in
 * process_exit_notify(). NOTE(review): written without sched_lock —
 * presumably only called during single-threaded bring-up; confirm. */
void sched_set_init_pid(uint32_t pid) {
init_pid = pid;
}
void sched_assign_pid1(struct process* p) {
if (!p) return;
uintptr_t flags = spin_lock_irqsave(&sched_lock);
p->pid = 1;
p->tgid = 1;
init_pid = 1;
spin_unlock_irqrestore(&sched_lock, flags);
}
void process_exit_notify(int status) {
if (!current_process) return;
uintptr_t flags = spin_lock_irqsave(&sched_lock);
current_process->exit_status = status;
current_process->state = PROCESS_ZOMBIE;
alarm_queue_remove(current_process);
/* Reparent children to the init process so orphaned zombies can be reaped.
* If init is waiting on pid==-1, wake it to collect newly-adopted zombies. */
{
struct process* init_proc = init_pid ? process_find_locked(init_pid) : NULL;
struct process* it = ready_queue_head;
if (it && init_proc) {
const struct process* const start = it;
do {
if (it->parent_pid == current_process->pid && it != current_process) {
it->parent_pid = init_pid;
/* If the child is already a zombie and init is waiting, wake init */
if (it->state == PROCESS_ZOMBIE &&
init_proc->state == PROCESS_BLOCKED && init_proc->waiting &&
init_proc->wait_pid == -1) {
init_proc->wait_result_pid = (int)it->pid;
init_proc->wait_result_status = it->exit_status;
init_proc->state = PROCESS_READY;
rq_enqueue(pcpu_rq[init_proc->cpu_id].active, init_proc);
}
}
it = it->next;
} while (it && it != start);
}
}
uint32_t wake_cpu = (uint32_t)-1;
if (current_process->pid != 0) {
struct process* parent = process_find_locked(current_process->parent_pid);
if (parent && parent->state == PROCESS_BLOCKED && parent->waiting) {
if (parent->wait_pid == -1 || parent->wait_pid == (int)current_process->pid) {
parent->wait_result_pid = (int)current_process->pid;
parent->wait_result_status = status;
parent->state = PROCESS_READY;
uint32_t pcpu = parent->cpu_id < SCHED_MAX_CPUS ? parent->cpu_id : 0;
rq_enqueue(pcpu_rq[pcpu].active, parent);
sched_pcpu_inc_load(pcpu);
wake_cpu = pcpu;
}
}
}
spin_unlock_irqrestore(&sched_lock, flags);
if (wake_cpu != (uint32_t)-1) sched_ipi_resched(wake_cpu);
}
/* First code a forked child runs on its new kernel stack (entered via
 * thread_wrapper): activate the child's address space and drop to user
 * mode with the register snapshot captured at fork time. */
static void fork_child_trampoline(void) {
if (!current_process || !current_process->has_user_regs) {
/* No saved user context — should not happen; terminate the child. */
process_exit_notify(1);
schedule();
for (;;) hal_cpu_idle();
}
if (current_process->addr_space) {
vmm_as_activate(current_process->addr_space);
}
hal_usermode_enter_regs(current_process->user_regs);
}
/* Create a fork() child: fresh PID, the caller-provided (COW-cloned)
 * address space, duplicated FD table / mmaps / FPU state, and a saved user
 * register snapshot resumed via fork_child_trampoline. Returns the new
 * process (already enqueued on the least-loaded CPU) or NULL on OOM.
 * NOTE(review): kmalloc/kstack_alloc run while holding sched_lock; the
 * comment in sched_ap_init() documents an ABBA deadlock between sched_lock
 * and the heap lock for exactly this pattern — confirm and consider moving
 * the allocations before the lock, as sched_ap_init() and
 * process_create_kernel-style code do. */
struct process* process_fork_create(uintptr_t child_as, const void* child_regs) {
if (!child_as || !child_regs) return NULL;
uintptr_t flags = spin_lock_irqsave(&sched_lock);
struct process* proc = (struct process*)kmalloc(sizeof(*proc));
if (!proc) {
spin_unlock_irqrestore(&sched_lock, flags);
return NULL;
}
memset(proc, 0, sizeof(*proc));
/* Inherit identity/credentials/scheduling attributes from the parent. */
proc->pid = next_pid++;
proc->parent_pid = current_process ? current_process->pid : 0;
proc->session_id = current_process ? current_process->session_id : proc->pid;
proc->pgrp_id = current_process ? current_process->pgrp_id : proc->pid;
proc->uid = current_process ? current_process->uid : 0;
proc->gid = current_process ? current_process->gid : 0;
proc->euid = current_process ? current_process->euid : 0;
proc->egid = current_process ? current_process->egid : 0;
proc->priority = current_process ? current_process->priority : SCHED_DEFAULT_PRIO;
proc->nice = current_process ? current_process->nice : 0;
proc->state = PROCESS_READY;
proc->addr_space = child_as;
proc->wake_at_tick = 0;
proc->exit_status = 0;
proc->waiting = 0;
proc->wait_pid = -1;
proc->wait_result_pid = -1;
proc->wait_result_status = 0;
if (current_process) {
strcpy(proc->cwd, current_process->cwd);
} else {
strcpy(proc->cwd, "/");
}
/* Snapshot of the parent's user registers; child resumes from here. */
proc->has_user_regs = 1;
memcpy(proc->user_regs, child_regs, ARCH_REGS_SIZE);
proc->tgid = proc->pid;
proc->flags = 0;
proc->tls_base = 0;
proc->clear_child_tid = NULL;
uint32_t fork_target = sched_pcpu_least_loaded();
proc->cpu_id = fork_target;
if (current_process) {
memcpy(proc->fpu_state, current_process->fpu_state, FPU_STATE_SIZE);
} else {
arch_fpu_init_state(proc->fpu_state);
}
/* Copy parent's file descriptors under sched_lock so the child
* cannot be scheduled before its FD table is fully populated.
* Refcounts are bumped atomically for each shared struct file. */
if (current_process) {
for (int i = 0; i < PROCESS_MAX_FILES; i++) {
struct file* f = current_process->files[i];
if (f) {
__sync_fetch_and_add(&f->refcount, 1);
proc->files[i] = f;
proc->fd_flags[i] = current_process->fd_flags[i];
} else {
proc->files[i] = NULL;
}
}
} else {
for (int i = 0; i < PROCESS_MAX_FILES; i++)
proc->files[i] = NULL;
}
for (int i = 0; i < PROCESS_MAX_MMAPS; i++) {
proc->mmaps[i] = current_process ? current_process->mmaps[i]
: (typeof(proc->mmaps[i])){0, 0, -1};
}
void* stack = kstack_alloc();
if (!stack) {
/* Undo FD refcount bumps on failure */
if (current_process) {
for (int i = 0; i < PROCESS_MAX_FILES; i++) {
if (proc->files[i])
__sync_sub_and_fetch(&proc->files[i]->refcount, 1);
}
}
kfree(proc);
spin_unlock_irqrestore(&sched_lock, flags);
return NULL;
}
proc->kernel_stack = (uint32_t*)stack;
proc->sp = arch_kstack_init((uint8_t*)stack + KSTACK_SIZE,
thread_wrapper, fork_child_trampoline);
/* Link into the circular process list and the target CPU's runqueue. */
proc->next = ready_queue_head;
proc->prev = ready_queue_tail;
ready_queue_tail->next = proc;
ready_queue_head->prev = proc;
ready_queue_tail = proc;
rq_enqueue(pcpu_rq[fork_target].active, proc);
sched_pcpu_inc_load(fork_target);
spin_unlock_irqrestore(&sched_lock, flags);
sched_ipi_resched(fork_target);
return proc;
}
/* First code a clone() child runs on its new kernel stack (entered via
 * thread_wrapper): activate the (possibly shared) address space, load TLS
 * if one was set via CLONE_SETTLS, then drop to user mode with the saved
 * register snapshot. */
static void clone_child_trampoline(void) {
if (!current_process || !current_process->has_user_regs) {
/* No saved user context — should not happen; terminate the child. */
process_exit_notify(1);
schedule();
for (;;) hal_cpu_idle();
}
/* Activate the shared address space */
if (current_process->addr_space) {
vmm_as_activate(current_process->addr_space);
}
/* Load user TLS into GS if set */
if (current_process->tls_base) {
hal_cpu_set_tls(current_process->tls_base);
}
hal_usermode_enter_regs(current_process->user_regs);
}
/* Create a clone() child honoring CLONE_VM / CLONE_THREAD / CLONE_FILES /
 * CLONE_SIGHAND / CLONE_SETTLS. Without CLONE_VM the parent's address
 * space is COW-cloned. Returns the new process (already enqueued on the
 * least-loaded CPU) or NULL on OOM.
 * NOTE(review): kmalloc/vmm_as_clone_user_cow/kstack_alloc run while
 * holding sched_lock; sched_ap_init() documents an ABBA deadlock between
 * sched_lock and the heap lock for this pattern — confirm and consider
 * allocating before taking the lock. */
struct process* process_clone_create(uint32_t clone_flags,
uintptr_t child_stack,
const void* child_regs,
uintptr_t tls_base) {
if (!child_regs || !current_process) return NULL;
uintptr_t flags = spin_lock_irqsave(&sched_lock);
struct process* proc = (struct process*)kmalloc(sizeof(*proc));
if (!proc) {
spin_unlock_irqrestore(&sched_lock, flags);
return NULL;
}
memset(proc, 0, sizeof(*proc));
proc->pid = next_pid++;
proc->parent_pid = current_process->pid;
proc->session_id = current_process->session_id;
proc->pgrp_id = current_process->pgrp_id;
proc->priority = current_process->priority;
proc->nice = current_process->nice;
proc->state = PROCESS_READY;
proc->wake_at_tick = 0;
proc->exit_status = 0;
proc->waiting = 0;
proc->wait_pid = -1;
proc->wait_result_pid = -1;
proc->wait_result_status = 0;
/* CLONE_VM: share address space */
if (clone_flags & CLONE_VM) {
proc->addr_space = current_process->addr_space;
proc->flags |= PROCESS_FLAG_THREAD;
} else {
proc->addr_space = vmm_as_clone_user_cow(current_process->addr_space);
if (!proc->addr_space) {
kfree(proc);
spin_unlock_irqrestore(&sched_lock, flags);
return NULL;
}
}
/* CLONE_THREAD: same thread group */
if (clone_flags & CLONE_THREAD) {
proc->tgid = current_process->tgid;
} else {
proc->tgid = proc->pid;
}
/* CLONE_FS: share cwd */
strcpy(proc->cwd, current_process->cwd);
/* CLONE_FILES: share file descriptor table */
if (clone_flags & CLONE_FILES) {
for (int i = 0; i < PROCESS_MAX_FILES; i++) {
proc->files[i] = current_process->files[i];
if (proc->files[i]) {
__sync_fetch_and_add(&proc->files[i]->refcount, 1);
}
proc->fd_flags[i] = current_process->fd_flags[i];
}
}
/* CLONE_SIGHAND: share signal handlers */
if (clone_flags & CLONE_SIGHAND) {
for (int i = 0; i < PROCESS_MAX_SIG; i++) {
proc->sigactions[i] = current_process->sigactions[i];
}
}
/* CLONE_SETTLS: set TLS base */
if (clone_flags & CLONE_SETTLS) {
proc->tls_base = tls_base;
}
/* Credentials, heap bounds, rlimits, FPU and mmaps are inherited. */
proc->uid = current_process->uid;
proc->gid = current_process->gid;
proc->euid = current_process->euid;
proc->egid = current_process->egid;
proc->heap_start = current_process->heap_start;
proc->heap_break = current_process->heap_break;
memcpy(proc->rlimits, current_process->rlimits, sizeof(proc->rlimits));
memcpy(proc->fpu_state, current_process->fpu_state, FPU_STATE_SIZE);
for (int i = 0; i < PROCESS_MAX_MMAPS; i++) {
proc->mmaps[i] = current_process->mmaps[i];
}
proc->has_user_regs = 1;
memcpy(proc->user_regs, child_regs, ARCH_REGS_SIZE);
arch_regs_set_retval(proc->user_regs, 0); /* child returns 0 */
/* If child_stack specified, override user stack pointer */
if (child_stack) {
arch_regs_set_ustack(proc->user_regs, child_stack);
}
/* Allocate kernel stack */
void* kstack = kstack_alloc();
if (!kstack) {
if (!(clone_flags & CLONE_VM) && proc->addr_space) {
vmm_as_destroy(proc->addr_space);
}
kfree(proc);
spin_unlock_irqrestore(&sched_lock, flags);
return NULL;
}
proc->kernel_stack = (uint32_t*)kstack;
proc->sp = arch_kstack_init((uint8_t*)kstack + KSTACK_SIZE,
thread_wrapper, clone_child_trampoline);
uint32_t clone_target = sched_pcpu_least_loaded();
proc->cpu_id = clone_target;
/* Insert into process list */
proc->next = ready_queue_head;
proc->prev = ready_queue_tail;
ready_queue_tail->next = proc;
ready_queue_head->prev = proc;
ready_queue_tail = proc;
rq_enqueue(pcpu_rq[clone_target].active, proc);
sched_pcpu_inc_load(clone_target);
spin_unlock_irqrestore(&sched_lock, flags);
sched_ipi_resched(clone_target);
return proc;
}
/* Look up a process by PID.
 * NOTE(review): the pointer is returned after sched_lock is dropped, so
 * the process may be reaped (freed) at any moment — callers must only use
 * the result where the target's lifetime is otherwise guaranteed. */
struct process* process_find_by_pid(uint32_t pid) {
uintptr_t flags = spin_lock_irqsave(&sched_lock);
struct process* p = process_find_locked(pid);
spin_unlock_irqrestore(&sched_lock, flags);
return p;
}
/* Bootstrap the scheduler on the BSP: build the PID 0 idle/kernel process,
 * initialize all per-CPU runqueues, give PID 0 a guard-paged kernel stack,
 * and make the (circular) process list contain just PID 0. On any OOM the
 * system halts — there is nothing to fall back to this early. */
void process_init(void) {
kprintf("[SCHED] Initializing Multitasking...\n");
uintptr_t flags = spin_lock_irqsave(&sched_lock);
// Initial Kernel Thread (PID 0) - IDLE TASK
struct process* kernel_proc = (struct process*)kmalloc(sizeof(*kernel_proc));
if (!kernel_proc) {
spin_unlock_irqrestore(&sched_lock, flags);
kprintf("[SCHED] OOM allocating kernel process struct.\n");
for(;;) hal_cpu_idle();
__builtin_unreachable();
}
memset(kernel_proc, 0, sizeof(*kernel_proc));
/* Initialize per-CPU runqueues */
for (uint32_t c = 0; c < SCHED_MAX_CPUS; c++) {
memset(&pcpu_rq[c], 0, sizeof(pcpu_rq[c]));
pcpu_rq[c].active = &pcpu_rq[c].stores[0];
pcpu_rq[c].expired = &pcpu_rq[c].stores[1];
pcpu_rq[c].idle = NULL;
}
kernel_proc->pid = 0;
kernel_proc->parent_pid = 0;
kernel_proc->session_id = 0;
kernel_proc->pgrp_id = 0;
kernel_proc->priority = SCHED_NUM_PRIOS - 1; // idle = lowest priority
kernel_proc->nice = 19;
kernel_proc->state = PROCESS_RUNNING;
kernel_proc->wake_at_tick = 0;
/* Capture the boot address space as the shared kernel address space. */
kernel_proc->addr_space = hal_cpu_get_address_space();
kernel_as = kernel_proc->addr_space;
kernel_proc->exit_status = 0;
kernel_proc->waiting = 0;
kernel_proc->wait_pid = -1;
kernel_proc->wait_result_pid = -1;
kernel_proc->wait_result_status = 0;
strcpy(kernel_proc->cwd, "/");
for (int i = 0; i < PROCESS_MAX_FILES; i++) {
kernel_proc->files[i] = NULL;
}
for (int i = 0; i < PROCESS_MAX_MMAPS; i++) {
kernel_proc->mmaps[i].shmid = -1;
}
kernel_proc->tgid = 0;
kernel_proc->flags = 0;
kernel_proc->tls_base = 0;
kernel_proc->clear_child_tid = NULL;
kernel_proc->cpu_id = 0;
arch_fpu_init_state(kernel_proc->fpu_state);
/* Allocate a dedicated kernel stack for PID 0 with guard page. */
void* kstack0 = kstack_alloc();
if (!kstack0) {
spin_unlock_irqrestore(&sched_lock, flags);
kprintf("[SCHED] OOM allocating PID 0 kernel stack.\n");
for (;;) hal_cpu_idle();
__builtin_unreachable();
}
kernel_proc->kernel_stack = (uint32_t*)kstack0;
pcpu_rq[0].idle = kernel_proc;
percpu_set_current(kernel_proc);
/* Circular list of one element: head == tail == PID 0, linked to itself. */
ready_queue_head = kernel_proc;
ready_queue_tail = kernel_proc;
kernel_proc->next = kernel_proc;
kernel_proc->prev = kernel_proc;
hal_cpu_set_kernel_stack((uintptr_t)kstack0 + KSTACK_SIZE);
spin_unlock_irqrestore(&sched_lock, flags);
}
/* Bring up scheduling on an application processor: build that CPU's idle
 * process (with its own guard-paged kernel stack) and register it as both
 * the CPU's idle task and its current process. CPU 0 uses PID 0 from
 * process_init() instead. OOM here leaves the AP without an idle task. */
void sched_ap_init(uint32_t cpu) {
if (cpu == 0 || cpu >= SCHED_MAX_CPUS) return;
/* Allocate OUTSIDE sched_lock to avoid ABBA deadlock with heap lock:
* AP: sched_lock → heap_lock (kmalloc)
* BSP: heap_lock → timer ISR → sched_lock ← deadlock */
struct process* idle = (struct process*)kmalloc(sizeof(*idle));
if (!idle) {
kprintf("[SCHED] CPU%u: OOM allocating idle process.\n", cpu);
return;
}
memset(idle, 0, sizeof(*idle));
void* kstack = kstack_alloc();
if (!kstack) {
kfree(idle);
kprintf("[SCHED] CPU%u: OOM allocating idle kstack.\n", cpu);
return;
}
/* Fill in idle process fields (no lock needed — not yet visible) */
idle->parent_pid = 0;
idle->priority = SCHED_NUM_PRIOS - 1;
idle->nice = 19;
idle->state = PROCESS_RUNNING;
idle->addr_space = kernel_as;
idle->cpu_id = cpu;
strcpy(idle->cwd, "/");
for (int i = 0; i < PROCESS_MAX_MMAPS; i++)
idle->mmaps[i].shmid = -1;
idle->kernel_stack = (uint32_t*)kstack;
arch_fpu_init_state(idle->fpu_state);
/* Take sched_lock only for PID assignment + list insertion */
uintptr_t flags = spin_lock_irqsave(&sched_lock);
idle->pid = next_pid++;
idle->tgid = idle->pid;
/* Insert into global process list */
idle->next = ready_queue_head;
idle->prev = ready_queue_tail;
ready_queue_tail->next = idle;
ready_queue_head->prev = idle;
ready_queue_tail = idle;
/* Register as this CPU's idle process and current process */
pcpu_rq[cpu].idle = idle;
percpu_set_current(idle);
spin_unlock_irqrestore(&sched_lock, flags);
kprintf("[SCHED] CPU%u idle process PID %u ready.\n", cpu, idle->pid);
}
/* NOTE(review): redundant re-declaration — sched_lock is already defined
 * earlier in this file; harmless, could be removed. */
extern spinlock_t sched_lock;
/* Entry point for every brand-new thread: context_switch()'s `ret` lands
 * here (see arch_kstack_init callers). Runs fn() and idles forever if fn
 * returns — threads are expected to exit via process_exit_notify(). */
void thread_wrapper(void (*fn)(void)) {
/* We arrive here from context_switch while schedule() still holds
* sched_lock with interrupts disabled. Release the lock and
* enable interrupts so the new thread can run normally. */
spin_unlock(&sched_lock);
hal_cpu_enable_interrupts();
fn();
for(;;) hal_cpu_idle();
}
struct process* process_create_kernel(void (*entry_point)(void)) {
uintptr_t flags = spin_lock_irqsave(&sched_lock);
struct process* proc = (struct process*)kmalloc(sizeof(*proc));
if (!proc) {
spin_unlock_irqrestore(&sched_lock, flags);
return NULL;
}
memset(proc, 0, sizeof(*proc));
proc->pid = next_pid++;
proc->parent_pid = current_process ? current_process->pid : 0;
proc->session_id = current_process ? current_process->session_id : proc->pid;
proc->pgrp_id = current_process ? current_process->pgrp_id : proc->pid;
proc->priority = SCHED_DEFAULT_PRIO;
proc->nice = 0;
proc->state = PROCESS_READY;
proc->addr_space = kernel_as ? kernel_as : (current_process ? current_process->addr_space : 0);
proc->wake_at_tick = 0;
proc->exit_status = 0;
proc->waiting = 0;
proc->wait_pid = -1;
proc->wait_result_pid = -1;
proc->wait_result_status = 0;
proc->tgid = proc->pid;
proc->flags = 0;
proc->tls_base = 0;
proc->clear_child_tid = NULL;
uint32_t target = sched_pcpu_least_loaded();
proc->cpu_id = target;
arch_fpu_init_state(proc->fpu_state);
for (int i = 0; i < PROCESS_MAX_FILES; i++) {
proc->files[i] = NULL;
}
for (int i = 0; i < PROCESS_MAX_MMAPS; i++) {
proc->mmaps[i].shmid = -1;
}
void* stack = kstack_alloc();
if (!stack) {
kfree(proc);
spin_unlock_irqrestore(&sched_lock, flags);
return NULL;
}
proc->kernel_stack = (uint32_t*)stack;
proc->sp = arch_kstack_init((uint8_t*)stack + KSTACK_SIZE,
thread_wrapper, entry_point);
proc->next = ready_queue_head;
proc->prev = ready_queue_tail;
ready_queue_tail->next = proc;
ready_queue_head->prev = proc;
ready_queue_tail = proc;
rq_enqueue(pcpu_rq[target].active, proc);
sched_pcpu_inc_load(target);
spin_unlock_irqrestore(&sched_lock, flags);
sched_ipi_resched(target);
return proc;
}
/* Core scheduler entry point, called from the timer tick and from
 * voluntary yields/blocks. Handles time-slice accounting, deferred sleep
 * queue insertion, O(1) next-pick, and the address-space / TSS / FPU /
 * context switch. Takes sched_lock; context_switch() runs with it held
 * (see the block comment near the bottom for why). */
void schedule(void) {
uintptr_t irq_flags = spin_lock_irqsave(&sched_lock);
if (!current_process) {
spin_unlock_irqrestore(&sched_lock, irq_flags);
return;
}
uint32_t cpu = percpu_cpu_index();
struct cpu_rq *crq = &pcpu_rq[cpu];
struct process* prev = current_process;
// Time-slice preemption: if the process is still running (timer
// preemption, not a voluntary yield) and has quantum left, do NOT
// preempt. Woken processes accumulate in active and get their
// turn when the slice expires. This limits context-switch rate to
// TIMER_HZ/SCHED_TIME_SLICE while keeping full tick resolution for
// sleep/wake timing.
if (prev->state == PROCESS_RUNNING) {
if (prev->time_slice > 0) {
prev->time_slice--;
spin_unlock_irqrestore(&sched_lock, irq_flags);
return;
}
// Slice exhausted — enqueue to expired with priority decay.
prev->state = PROCESS_READY;
if (prev->priority < SCHED_NUM_PRIOS - 1) prev->priority++;
rq_enqueue(crq->expired, prev);
} else if (prev->state == PROCESS_SLEEPING && !prev->in_sleep_queue) {
/* Deferred sleep queue insertion: the caller set SLEEPING + wake_at_tick
* under its own lock (e.g. semaphore), then called schedule().
* We insert here under sched_lock — no preemption window. */
sleep_queue_remove(prev); /* defensive */
sleep_queue_insert(prev);
}
// Pick highest-priority READY process from this CPU's runqueues (O(1) bitmap).
// rq_pick_next() may swap active/expired internally.
struct process* next = rq_pick_next(cpu);
if (next) {
// next came from active — safe to dequeue.
rq_dequeue(crq->active, next);
sched_pcpu_dec_load(cpu);
} else {
// Nothing in runqueues.
if (prev->state == PROCESS_READY) {
// prev was just enqueued to expired — pull it back.
rq_dequeue(crq->expired, prev);
next = prev;
} else {
// Fall back to this CPU's idle process.
if (crq->idle) {
next = crq->idle;
} else {
// Legacy fallback: find PID 0 in process list.
struct process* it = ready_queue_head;
next = it;
if (it) {
const struct process* start = it;
do {
if (it->pid == 0) { next = it; break; }
it = it->next;
} while (it && it != start);
}
}
}
}
if (prev == next) {
/* No switch needed: refresh prev's slice and keep running it. */
prev->state = PROCESS_RUNNING;
prev->time_slice = SCHED_TIME_SLICE;
spin_unlock_irqrestore(&sched_lock, irq_flags);
return;
}
percpu_set_current(next);
current_process->state = PROCESS_RUNNING;
current_process->time_slice = SCHED_TIME_SLICE;
/* Only reload the address space when it actually changes (TLB cost). */
if (current_process->addr_space && current_process->addr_space != prev->addr_space) {
hal_cpu_set_address_space(current_process->addr_space);
}
/* Update this CPU's TSS kernel stack so ring 3→0 transitions
* use the correct per-task kernel stack. Each CPU has its own TSS. */
if (current_process->kernel_stack) {
hal_cpu_set_kernel_stack((uintptr_t)current_process->kernel_stack + KSTACK_SIZE);
}
/* context_switch MUST execute with the lock held and interrupts
* disabled. Otherwise a timer firing between unlock and
* context_switch would call schedule() again while current_process
* is already set to `next` but we're still on prev's stack,
* corrupting next->sp.
*
* After context_switch we are on the new process's stack.
* irq_flags now holds the value saved during THIS process's
* previous spin_lock_irqsave in schedule(). Releasing the
* lock restores the correct interrupt state.
*
* For brand-new processes, context_switch's `ret` goes to
* thread_wrapper which releases the lock explicitly. */
arch_fpu_save(prev->fpu_state);
context_switch(&prev->sp, current_process->sp);
arch_fpu_restore(current_process->fpu_state);
spin_unlock_irqrestore(&sched_lock, irq_flags);
}
/* Kept only for ABI/link compatibility with older callers. */
void sched_sleep_enqueue_self(void) {
/* No-op: schedule() now handles deferred sleep queue insertion
* atomically under sched_lock, eliminating preemption windows. */
}
void process_sleep(uint32_t ticks) {
/* Put the calling process to sleep for at least `ticks` timer ticks.
 * Marks the process PROCESS_SLEEPING, inserts it into the sorted sleep
 * queue under sched_lock, then yields via schedule(). The process is
 * made runnable again by process_wake_check() once the global tick
 * reaches wake_at_tick. */
extern uint32_t get_tick_count(void);
uint32_t current_tick = get_tick_count();
uintptr_t flags = spin_lock_irqsave(&sched_lock);
/* Absolute deadline; wrap-around of the 32-bit tick counter is not
 * handled here — presumably acceptable for this kernel's uptime
 * assumptions (TODO confirm). */
current_process->wake_at_tick = current_tick + ticks;
current_process->state = PROCESS_SLEEPING;
sleep_queue_remove(current_process); /* defensive: remove stale entry if present */
sleep_queue_insert(current_process);
spin_unlock_irqrestore(&sched_lock, flags);
/* NOTE(review): between the unlock above and schedule() below, a timer
 * interrupt may run process_wake_check(), see this process already in
 * the sleep queue, and re-ready it while it is still running here.
 * Verify schedule() tolerates entering with state != PROCESS_SLEEPING,
 * or move the schedule() call under the lock/deferred path. */
schedule();
}
void process_wake_check(uint32_t current_tick) {
/* BSP timer-tick bookkeeping, run once per global tick:
 *   1. charge one tick of CPU time to the running process,
 *   2. wake sleepers whose deadline has passed (sorted sleep queue),
 *   3. fire expired alarms (SIGALRM) and re-arm repeating ones,
 *   4. service the ITIMER_VIRTUAL / ITIMER_PROF countdowns,
 *   5. IPI remote CPUs that just received newly-ready processes.
 * All queue and state mutation happens under sched_lock; the IPIs are
 * sent only after the lock is dropped. */
uintptr_t flags = spin_lock_irqsave(&sched_lock);
struct process* iter = ready_queue_head;
if (!iter) {
/* NOTE(review): this early-out triggers when the legacy process list is
 * empty, but it also skips the sleep/alarm queue processing below,
 * which does not depend on ready_queue_head. Presumably an empty list
 * means no processes exist at all — confirm that invariant. */
spin_unlock_irqrestore(&sched_lock, flags);
return;
}
/* CPU time accounting: charge one tick to the running process */
if (current_process && current_process->state == PROCESS_RUNNING) {
current_process->utime++;
}
/* O(1) sleep queue: pop expired entries from the sorted head.
 * Track which remote CPUs need an IPI to pick up newly-ready work. */
uint32_t ipi_mask = 0; /* bitmask of CPUs needing IPI */
while (sleep_head && current_tick >= sleep_head->wake_at_tick) {
struct process* p = sleep_head;
sleep_queue_remove(p);
/* Only transition genuine sleepers; a process may have been killed or
 * otherwise re-stated while still queued. */
if (p->state == PROCESS_SLEEPING) {
p->state = PROCESS_READY;
if (p->priority > 0) p->priority--; /* small interactivity boost on wake */
/* Re-enqueue on the CPU it last ran on (clamped to a valid index). */
uint32_t tcpu = p->cpu_id < SCHED_MAX_CPUS ? p->cpu_id : 0;
rq_enqueue(pcpu_rq[tcpu].active, p);
sched_pcpu_inc_load(tcpu);
if (tcpu < 32) ipi_mask |= (1U << tcpu); /* mask is 32 bits wide */
}
}
/* O(1) alarm queue: pop expired entries from the sorted head */
while (alarm_head && current_tick >= alarm_head->alarm_tick) {
struct process* p = alarm_head;
alarm_queue_remove(p);
p->sig_pending_mask |= (1U << 14); /* SIGALRM */
/* Re-arm repeating ITIMER_REAL */
if (p->alarm_interval != 0) {
p->alarm_tick = current_tick + p->alarm_interval;
alarm_queue_insert(p);
} else {
p->alarm_tick = 0;
}
}
/* ITIMER_VIRTUAL: decrement when running in user mode */
if (current_process && current_process->state == PROCESS_RUNNING) {
if (current_process->itimer_virt_value > 0) {
current_process->itimer_virt_value--;
if (current_process->itimer_virt_value == 0) {
current_process->sig_pending_mask |= (1U << 26); /* SIGVTALRM */
/* interval == 0 leaves the timer disarmed (one-shot) */
current_process->itimer_virt_value = current_process->itimer_virt_interval;
}
}
/* ITIMER_PROF: decrement when running (user + kernel) */
if (current_process->itimer_prof_value > 0) {
current_process->itimer_prof_value--;
if (current_process->itimer_prof_value == 0) {
current_process->sig_pending_mask |= (1U << 27); /* SIGPROF */
current_process->itimer_prof_value = current_process->itimer_prof_interval;
}
}
}
spin_unlock_irqrestore(&sched_lock, flags);
/* Send IPI to remote CPUs that received newly-ready processes */
uint32_t my_cpu = percpu_cpu_index();
ipi_mask &= ~(1U << my_cpu); /* no self-IPI */
while (ipi_mask) {
uint32_t c = (uint32_t)__builtin_ctz(ipi_mask);
sched_ipi_resched(c);
ipi_mask &= ~(1U << c);
}
}
void sched_ap_tick(void) {
/* Called from AP timer interrupt — per-CPU accounting.
* Sleep/alarm queues and global tick are managed by BSP. */
uintptr_t flags = spin_lock_irqsave(&sched_lock);
if (current_process && current_process->state == PROCESS_RUNNING) {
current_process->utime++;
/* ITIMER_VIRTUAL: decrement when running in user mode */
if (current_process->itimer_virt_value > 0) {
current_process->itimer_virt_value--;
if (current_process->itimer_virt_value == 0) {
current_process->sig_pending_mask |= (1U << 26); /* SIGVTALRM */
current_process->itimer_virt_value = current_process->itimer_virt_interval;
}
}
/* ITIMER_PROF: decrement when running (user + kernel) */
if (current_process->itimer_prof_value > 0) {
current_process->itimer_prof_value--;
if (current_process->itimer_prof_value == 0) {
current_process->sig_pending_mask |= (1U << 27); /* SIGPROF */
current_process->itimer_prof_value = current_process->itimer_prof_interval;
}
}
}
spin_unlock_irqrestore(&sched_lock, flags);
}
void sched_load_balance(void) {
/* Periodic work stealing: called from BSP timer tick.
 * Migrates one process from the busiest CPU to the idlest CPU
 * if the load imbalance exceeds a threshold. */
uint32_t ncpus = sched_pcpu_count();
if (ncpus <= 1) return;
uint32_t max_cpu = 0, min_cpu = 0;
uint32_t max_load = 0, min_load = (uint32_t)-1;
/* NOTE(review): the loads are sampled before taking sched_lock, so the
 * imbalance may be stale by the time we migrate — presumably an
 * intentional approximation to keep the common (balanced) path cheap. */
for (uint32_t i = 0; i < ncpus && i < SCHED_MAX_CPUS; i++) {
uint32_t load = sched_pcpu_get_load(i);
if (load > max_load) { max_load = load; max_cpu = i; }
if (load < min_load) { min_load = load; min_cpu = i; }
}
/* Only migrate if imbalance >= 2 (avoids ping-pong) */
if (max_cpu == min_cpu || max_load < min_load + 2) return;
uintptr_t flags = spin_lock_irqsave(&sched_lock);
struct cpu_rq *src = &pcpu_rq[max_cpu];
/* Find a migratable process in the source's expired queue first,
 * then active. Skip idle processes and the currently running process
 * (anything RUNNING is excluded by the state check below). */
struct process* victim = NULL;
for (int pass = 0; pass < 2 && !victim; pass++) {
struct runqueue* rq = (pass == 0) ? src->expired : src->active;
if (!rq->bitmap) continue; /* bitmap bit per non-empty priority level */
/* Try lowest-priority queue first (least disruptive) */
for (int prio = SCHED_NUM_PRIOS - 1; prio >= 0 && !victim; prio--) {
if (!(rq->bitmap & (1U << prio))) continue;
struct process* p = rq->queue[prio].head;
while (p) {
if (p != src->idle && p->state == PROCESS_READY) {
victim = p;
rq_dequeue(rq, p);
break; /* break before advancing: p->rq_next is stale after dequeue */
}
p = p->rq_next;
}
}
}
if (victim) {
victim->cpu_id = min_cpu;
rq_enqueue(pcpu_rq[min_cpu].active, victim);
sched_pcpu_dec_load(max_cpu);
sched_pcpu_inc_load(min_cpu);
}
spin_unlock_irqrestore(&sched_lock, flags);
/* IPI outside the lock so the target can reschedule immediately. */
if (victim) sched_ipi_resched(min_cpu);
}
uint32_t process_alarm_set(struct process* p, uint32_t tick) {
    /* Arm (tick != 0) or cancel (tick == 0) the absolute-tick alarm for
     * process `p`, returning the previously programmed alarm tick
     * (0 if none was pending). Runs entirely under sched_lock so the
     * alarm queue stays consistent with p->alarm_tick. */
    uintptr_t irqf = spin_lock_irqsave(&sched_lock);
    uint32_t previous = p->alarm_tick;
    /* Always drop any queued entry first; re-inserting below keeps the
     * sorted alarm queue valid for the new deadline. */
    alarm_queue_remove(p);
    p->alarm_tick = tick; /* tick == 0 doubles as the "disarmed" marker */
    if (tick != 0) {
        alarm_queue_insert(p);
    }
    spin_unlock_irqrestore(&sched_lock, irqf);
    return previous;
}