From: Tulio A M Mendes Date: Sat, 14 Mar 2026 13:23:54 +0000 (-0300) Subject: feat: full SMP scheduling — AP tick accounting, IPI wakeups, load balancing X-Git-Url: https://projects.tadryanom.me/docs/POSIX_ROADMAP.md?a=commitdiff_plain;h=1374a6fc9217cf7236b0eed671ae0471db0a00d8;p=AdrOS.git feat: full SMP scheduling — AP tick accounting, IPI wakeups, load balancing Scheduler changes (src/kernel/scheduler.c): - sched_ap_tick(): per-CPU tick accounting for APs (utime, ITIMER_VIRTUAL, ITIMER_PROF) — previously only BSP tracked CPU time - sched_load_balance(): periodic work stealing — migrates one process from busiest to idlest CPU when imbalance >= 2 (avoids ping-pong) - IPI resched after sleep wakeups: process_wake_check() now sends IPI to remote CPUs that received newly-ready processes from the sleep queue - IPI resched after parent wakeup: process_exit_notify() sends IPI when waking a parent blocked in waitpid on a different CPU - Load counter (sched_pcpu_inc_load) added to wakeup paths that were missing it (exit_notify parent wake, sleep queue wake) Timer changes: - src/drivers/timer.c: hal_tick_bridge() now calls sched_ap_tick() on APs and sched_load_balance() on BSP every 10 ticks (~200ms at 50Hz) - src/hal/x86/timer.c: APs now go through hal_tick_bridge instead of calling bare schedule(), enabling proper AP tick accounting - Uses percpu_cpu_index() (GS segment read) instead of smp_current_cpu() (LAPIC ID linear scan) for faster CPU identification Tests: - New SMP parallel fork smoke test: forks 8 children with busy loops, verifies all complete with correct exit status (exercises multi-CPU scheduling, IPI wakeups, and load balancing) - 102/102 smoke tests pass, cppcheck clean, 64/64 host tests pass --- diff --git a/include/process.h b/include/process.h index c977d7d..e22f3d1 100644 --- a/include/process.h +++ b/include/process.h @@ -218,4 +218,10 @@ struct process* process_find_by_pid(uint32_t pid); // Skips insertion if the process is no longer PROCESS_SLEEPING. void sched_sleep_enqueue_self(void); +// Per-CPU tick accounting for AP cores (utime, itimers). +void sched_ap_tick(void); + +// Periodic load balancing: migrate one process from busiest to idlest CPU. +void sched_load_balance(void); + #endif diff --git a/src/drivers/timer.c b/src/drivers/timer.c index 1ee47a5..274e313 100644 --- a/src/drivers/timer.c +++ b/src/drivers/timer.c @@ -7,6 +7,10 @@ #include "hal/timer.h" #include "hal/uart.h" +#ifdef __i386__ +#include "arch/x86/percpu.h" +#endif + static uint32_t tick = 0; /* TSC-based nanosecond timekeeping */ @@ -50,10 +54,12 @@ uint64_t clock_gettime_ns(void) { return sec_part + frac_part; } +static uint32_t lb_counter = 0; +#define LOAD_BALANCE_INTERVAL 10 /* every 10 ticks (~200ms at 50Hz) */ + static void hal_tick_bridge(void) { #ifdef __i386__ - extern uint32_t smp_current_cpu(void); - uint32_t cpu = smp_current_cpu(); + uint32_t cpu = percpu_cpu_index(); #else uint32_t cpu = 0; #endif @@ -65,6 +71,15 @@ static void hal_tick_bridge(void) { vga_flush(); hal_uart_poll_rx(); process_wake_check(tick); + + /* Periodic load balancing */ + if (++lb_counter >= LOAD_BALANCE_INTERVAL) { + lb_counter = 0; + sched_load_balance(); + } + } else { + /* AP: per-CPU tick accounting (utime, itimers) */ + sched_ap_tick(); } /* All CPUs: run the scheduler to pick up new work */ diff --git a/src/hal/x86/timer.c b/src/hal/x86/timer.c index 6870a69..f1229a6 100644 --- a/src/hal/x86/timer.c +++ b/src/hal/x86/timer.c @@ -11,13 +11,10 @@ static hal_timer_tick_cb_t g_tick_cb = 0; static void timer_irq(struct registers* regs) { (void)regs; - if (lapic_is_enabled() && lapic_get_id() != 0) { - /* AP: only run the local scheduler — tick accounting, VGA flush, - * UART poll, and sleep-queue wake are handled by the BSP. */ - extern void schedule(void); - schedule(); - return; - } + /* All CPUs (BSP and APs) go through the tick bridge. + * The bridge dispatches BSP-only work (tick counter, sleep wakeups, + * VGA flush, load balancing) and AP-only work (sched_ap_tick), + * then calls schedule() on all CPUs. */ if (g_tick_cb) g_tick_cb(); } diff --git a/src/kernel/scheduler.c b/src/kernel/scheduler.c index d03a755..f2690a9 100644 --- a/src/kernel/scheduler.c +++ b/src/kernel/scheduler.c @@ -538,6 +538,7 @@ void process_exit_notify(int status) { } } + uint32_t wake_cpu = (uint32_t)-1; if (current_process->pid != 0) { struct process* parent = process_find_locked(current_process->parent_pid); if (parent && parent->state == PROCESS_BLOCKED && parent->waiting) { @@ -545,12 +546,16 @@ void process_exit_notify(int status) { parent->wait_result_pid = (int)current_process->pid; parent->wait_result_status = status; parent->state = PROCESS_READY; - rq_enqueue(pcpu_rq[parent->cpu_id].active, parent); + uint32_t pcpu = parent->cpu_id < SCHED_MAX_CPUS ? parent->cpu_id : 0; + rq_enqueue(pcpu_rq[pcpu].active, parent); + sched_pcpu_inc_load(pcpu); + wake_cpu = pcpu; } } } spin_unlock_irqrestore(&sched_lock, flags); + if (wake_cpu != (uint32_t)-1) sched_ipi_resched(wake_cpu); } static void fork_child_trampoline(void) { @@ -1188,14 +1193,19 @@ void process_wake_check(uint32_t current_tick) { current_process->utime++; } - /* O(1) sleep queue: pop expired entries from the sorted head */ + /* O(1) sleep queue: pop expired entries from the sorted head. + * Track which remote CPUs need an IPI to pick up newly-ready work. */ + uint32_t ipi_mask = 0; /* bitmask of CPUs needing IPI */ while (sleep_head && current_tick >= sleep_head->wake_at_tick) { struct process* p = sleep_head; sleep_queue_remove(p); if (p->state == PROCESS_SLEEPING) { p->state = PROCESS_READY; if (p->priority > 0) p->priority--; - rq_enqueue(pcpu_rq[p->cpu_id].active, p); + uint32_t tcpu = p->cpu_id < SCHED_MAX_CPUS ? p->cpu_id : 0; + rq_enqueue(pcpu_rq[tcpu].active, p); + sched_pcpu_inc_load(tcpu); + if (tcpu < 32) ipi_mask |= (1U << tcpu); } } @@ -1233,6 +1243,100 @@ void process_wake_check(uint32_t current_tick) { } spin_unlock_irqrestore(&sched_lock, flags); + + /* Send IPI to remote CPUs that received newly-ready processes */ + uint32_t my_cpu = percpu_cpu_index(); + ipi_mask &= ~(1U << my_cpu); /* no self-IPI */ + while (ipi_mask) { + uint32_t c = (uint32_t)__builtin_ctz(ipi_mask); + sched_ipi_resched(c); + ipi_mask &= ~(1U << c); + } +} + +void sched_ap_tick(void) { + /* Called from AP timer interrupt — per-CPU accounting. + * Sleep/alarm queues and global tick are managed by BSP. */ + uintptr_t flags = spin_lock_irqsave(&sched_lock); + + if (current_process && current_process->state == PROCESS_RUNNING) { + current_process->utime++; + + /* ITIMER_VIRTUAL: decrement when running in user mode */ + if (current_process->itimer_virt_value > 0) { + current_process->itimer_virt_value--; + if (current_process->itimer_virt_value == 0) { + current_process->sig_pending_mask |= (1U << 26); /* SIGVTALRM */ + current_process->itimer_virt_value = current_process->itimer_virt_interval; + } + } + /* ITIMER_PROF: decrement when running (user + kernel) */ + if (current_process->itimer_prof_value > 0) { + current_process->itimer_prof_value--; + if (current_process->itimer_prof_value == 0) { + current_process->sig_pending_mask |= (1U << 27); /* SIGPROF */ + current_process->itimer_prof_value = current_process->itimer_prof_interval; + } + } + } + + spin_unlock_irqrestore(&sched_lock, flags); +} + +void sched_load_balance(void) { + /* Periodic work stealing: called from BSP timer tick. + * Migrates one process from the busiest CPU to the idlest CPU + * if the load imbalance exceeds a threshold. */ + uint32_t ncpus = sched_pcpu_count(); + if (ncpus <= 1) return; + + uint32_t max_cpu = 0, min_cpu = 0; + uint32_t max_load = 0, min_load = (uint32_t)-1; + + for (uint32_t i = 0; i < ncpus && i < SCHED_MAX_CPUS; i++) { + uint32_t load = sched_pcpu_get_load(i); + if (load > max_load) { max_load = load; max_cpu = i; } + if (load < min_load) { min_load = load; min_cpu = i; } + } + + /* Only migrate if imbalance >= 2 (avoids ping-pong) */ + if (max_cpu == min_cpu || max_load < min_load + 2) return; + + uintptr_t flags = spin_lock_irqsave(&sched_lock); + + struct cpu_rq *src = &pcpu_rq[max_cpu]; + + /* Find a migratable process in the source's expired queue first, + * then active. Skip idle processes and the currently running process. */ + struct process* victim = NULL; + for (int pass = 0; pass < 2 && !victim; pass++) { + struct runqueue* rq = (pass == 0) ? src->expired : src->active; + if (!rq->bitmap) continue; + /* Try lowest-priority queue first (least disruptive) */ + for (int prio = SCHED_NUM_PRIOS - 1; prio >= 0 && !victim; prio--) { + if (!(rq->bitmap & (1U << prio))) continue; + struct process* p = rq->queue[prio].head; + while (p) { + if (p != src->idle && p->state == PROCESS_READY) { + victim = p; + rq_dequeue(rq, p); + break; + } + p = p->rq_next; + } + } + } + + if (victim) { + victim->cpu_id = min_cpu; + rq_enqueue(pcpu_rq[min_cpu].active, victim); + sched_pcpu_dec_load(max_cpu); + sched_pcpu_inc_load(min_cpu); + } + + spin_unlock_irqrestore(&sched_lock, flags); + + if (victim) sched_ipi_resched(min_cpu); } uint32_t process_alarm_set(struct process* p, uint32_t tick) { diff --git a/tests/smoke_test.exp b/tests/smoke_test.exp index 9239254..4c7551c 100755 --- a/tests/smoke_test.exp +++ b/tests/smoke_test.exp @@ -143,6 +143,7 @@ set tests { {"mprotect" "\\[init\\] mprotect OK"} {"getrlimit/setrlimit" "\\[init\\] getrlimit/setrlimit OK"} {"uname" "\\[init\\] uname OK"} + {"SMP parallel fork" "\\[init\\] SMP parallel fork OK"} {"LZ4 Frame decomp" "\\[INITRD\\] LZ4"} } diff --git a/user/fulltest.c b/user/fulltest.c index 4b05955..4ee9f0f 100644 --- a/user/fulltest.c +++ b/user/fulltest.c @@ -4396,6 +4396,43 @@ void _start(void) { sys_write(1, "[init] uname OK\n", (uint32_t)(sizeof("[init] uname OK\n") - 1)); } + // H1: SMP parallel fork test — exercises multi-CPU scheduling + load balancing + { + #define SMP_NCHILD 8 + int smp_pids[SMP_NCHILD]; + int smp_ok = 1; + + for (int i = 0; i < SMP_NCHILD; i++) { + int pid = sys_fork(); + if (pid == 0) { + /* Child: busy loop to consume a time slice, then exit with index */ + volatile uint32_t sum = 0; + for (uint32_t j = 0; j < 50000; j++) sum += j; + (void)sum; + sys_exit(i + 1); + } + smp_pids[i] = pid; + } + + /* Parent: wait for all children, verify each returned correct status */ + for (int i = 0; i < SMP_NCHILD; i++) { + int st = 0; + int wp = sys_waitpid(smp_pids[i], &st, 0); + if (wp != smp_pids[i] || st != (i + 1)) { + smp_ok = 0; + } + } + + if (smp_ok) { + static const char msg[] = "[init] SMP parallel fork OK\n"; + (void)sys_write(1, msg, (uint32_t)(sizeof(msg) - 1)); + } else { + static const char msg[] = "[init] SMP parallel fork FAIL\n"; + (void)sys_write(1, msg, (uint32_t)(sizeof(msg) - 1)); + } + #undef SMP_NCHILD + } + (void)sys_write(1, "[init] execve(/bin/echo)\n", (uint32_t)(sizeof("[init] execve(/bin/echo)\n") - 1)); static const char* const argv[] = {"echo", "[echo]", "hello", "from", "echo", 0};