From: Tulio A M Mendes <tadryanom@hotmail.com>
Date: Sat, 14 Mar 2026 13:23:54 +0000 (-0300)
Subject: feat: full SMP scheduling — AP tick accounting, IPI wakeups, load balancing
X-Git-Url: https://projects.tadryanom.me/docs/POSIX_ROADMAP.md?a=commitdiff_plain;h=1374a6fc9217cf7236b0eed671ae0471db0a00d8;p=AdrOS.git

feat: full SMP scheduling — AP tick accounting, IPI wakeups, load balancing

Scheduler changes (src/kernel/scheduler.c):
- sched_ap_tick(): per-CPU tick accounting for APs (utime, ITIMER_VIRTUAL,
  ITIMER_PROF) — previously only BSP tracked CPU time
- sched_load_balance(): periodic work stealing — migrates one process from
  busiest to idlest CPU when imbalance >= 2 (avoids ping-pong)
- IPI resched after sleep wakeups: process_wake_check() now sends IPI to
  remote CPUs that received newly-ready processes from the sleep queue
- IPI resched after parent wakeup: process_exit_notify() sends IPI when
  waking a parent blocked in waitpid on a different CPU
- Load counter (sched_pcpu_inc_load) added to wakeup paths that were
  missing it (exit_notify parent wake, sleep queue wake)

Timer changes:
- src/drivers/timer.c: hal_tick_bridge() now calls sched_ap_tick() on APs
  and sched_load_balance() on BSP every 10 ticks (~200ms at 50Hz)
- src/hal/x86/timer.c: APs now go through hal_tick_bridge instead of
  calling bare schedule(), enabling proper AP tick accounting
- Uses percpu_cpu_index() (GS segment read) instead of smp_current_cpu()
  (LAPIC ID linear scan) for faster CPU identification

Tests:
- New SMP parallel fork smoke test: forks 8 children with busy loops,
  verifies all complete with correct exit status (exercises multi-CPU
  scheduling, IPI wakeups, and load balancing)
- 102/102 smoke tests pass, cppcheck clean, 64/64 host tests pass
---

diff --git a/include/process.h b/include/process.h
index c977d7d..e22f3d1 100644
--- a/include/process.h
+++ b/include/process.h
@@ -218,4 +218,10 @@ struct process* process_find_by_pid(uint32_t pid);
 // Skips insertion if the process is no longer PROCESS_SLEEPING.
 void sched_sleep_enqueue_self(void);
 
+// Per-CPU tick accounting for AP cores (utime, itimers).
+void sched_ap_tick(void);
+
+// Periodic load balancing: migrate one process from busiest to idlest CPU.
+void sched_load_balance(void);
+
 #endif
diff --git a/src/drivers/timer.c b/src/drivers/timer.c
index 1ee47a5..274e313 100644
--- a/src/drivers/timer.c
+++ b/src/drivers/timer.c
@@ -7,6 +7,10 @@
 #include "hal/timer.h"
 #include "hal/uart.h"
 
+#ifdef __i386__
+#include "arch/x86/percpu.h"
+#endif
+
 static uint32_t tick = 0;
 
 /* TSC-based nanosecond timekeeping */
@@ -50,10 +54,12 @@ uint64_t clock_gettime_ns(void) {
     return sec_part + frac_part;
 }
 
+static uint32_t lb_counter = 0;
+#define LOAD_BALANCE_INTERVAL 10  /* every 10 ticks (~200ms at 50Hz) */
+
 static void hal_tick_bridge(void) {
 #ifdef __i386__
-    extern uint32_t smp_current_cpu(void);
-    uint32_t cpu = smp_current_cpu();
+    uint32_t cpu = percpu_cpu_index();
 #else
     uint32_t cpu = 0;
 #endif
@@ -65,6 +71,15 @@ static void hal_tick_bridge(void) {
         vga_flush();
         hal_uart_poll_rx();
         process_wake_check(tick);
+
+        /* Periodic load balancing */
+        if (++lb_counter >= LOAD_BALANCE_INTERVAL) {
+            lb_counter = 0;
+            sched_load_balance();
+        }
+    } else {
+        /* AP: per-CPU tick accounting (utime, itimers) */
+        sched_ap_tick();
     }
 
     /* All CPUs: run the scheduler to pick up new work */
diff --git a/src/hal/x86/timer.c b/src/hal/x86/timer.c
index 6870a69..f1229a6 100644
--- a/src/hal/x86/timer.c
+++ b/src/hal/x86/timer.c
@@ -11,13 +11,10 @@ static hal_timer_tick_cb_t g_tick_cb = 0;
 
 static void timer_irq(struct registers* regs) {
     (void)regs;
-    if (lapic_is_enabled() && lapic_get_id() != 0) {
-        /* AP: only run the local scheduler — tick accounting, VGA flush,
-         * UART poll, and sleep-queue wake are handled by the BSP. */
-        extern void schedule(void);
-        schedule();
-        return;
-    }
+    /* All CPUs (BSP and APs) go through the tick bridge.
+     * The bridge dispatches BSP-only work (tick counter, sleep wakeups,
+     * VGA flush, load balancing) and AP-only work (sched_ap_tick),
+     * then calls schedule() on all CPUs. */
     if (g_tick_cb) g_tick_cb();
 }
 
diff --git a/src/kernel/scheduler.c b/src/kernel/scheduler.c
index d03a755..f2690a9 100644
--- a/src/kernel/scheduler.c
+++ b/src/kernel/scheduler.c
@@ -538,6 +538,7 @@ void process_exit_notify(int status) {
         }
     }
 
+    uint32_t wake_cpu = (uint32_t)-1;
     if (current_process->pid != 0) {
         struct process* parent = process_find_locked(current_process->parent_pid);
         if (parent && parent->state == PROCESS_BLOCKED && parent->waiting) {
@@ -545,12 +546,16 @@ void process_exit_notify(int status) {
                 parent->wait_result_pid = (int)current_process->pid;
                 parent->wait_result_status = status;
                 parent->state = PROCESS_READY;
-                rq_enqueue(pcpu_rq[parent->cpu_id].active, parent);
+                uint32_t pcpu = parent->cpu_id < SCHED_MAX_CPUS ? parent->cpu_id : 0;
+                rq_enqueue(pcpu_rq[pcpu].active, parent);
+                sched_pcpu_inc_load(pcpu);
+                wake_cpu = pcpu;
             }
         }
     }
 
     spin_unlock_irqrestore(&sched_lock, flags);
+    if (wake_cpu != (uint32_t)-1) sched_ipi_resched(wake_cpu);
 }
 
 static void fork_child_trampoline(void) {
@@ -1188,14 +1193,19 @@ void process_wake_check(uint32_t current_tick) {
         current_process->utime++;
     }
 
-    /* O(1) sleep queue: pop expired entries from the sorted head */
+    /* O(1) sleep queue: pop expired entries from the sorted head.
+     * Track which remote CPUs need an IPI to pick up newly-ready work. */
+    uint32_t ipi_mask = 0;  /* bitmask of CPUs needing IPI */
     while (sleep_head && current_tick >= sleep_head->wake_at_tick) {
         struct process* p = sleep_head;
         sleep_queue_remove(p);
         if (p->state == PROCESS_SLEEPING) {
             p->state = PROCESS_READY;
             if (p->priority > 0) p->priority--;
-            rq_enqueue(pcpu_rq[p->cpu_id].active, p);
+            uint32_t tcpu = p->cpu_id < SCHED_MAX_CPUS ? p->cpu_id : 0;
+            rq_enqueue(pcpu_rq[tcpu].active, p);
+            sched_pcpu_inc_load(tcpu);
+            if (tcpu < 32) ipi_mask |= (1U << tcpu);
         }
     }
 
@@ -1233,6 +1243,100 @@ void process_wake_check(uint32_t current_tick) {
     }
 
     spin_unlock_irqrestore(&sched_lock, flags);
+
+    /* Send IPI to remote CPUs that received newly-ready processes */
+    uint32_t my_cpu = percpu_cpu_index();
+    ipi_mask &= ~(1U << my_cpu);  /* no self-IPI */
+    while (ipi_mask) {
+        uint32_t c = (uint32_t)__builtin_ctz(ipi_mask);
+        sched_ipi_resched(c);
+        ipi_mask &= ~(1U << c);
+    }
+}
+
+void sched_ap_tick(void) {
+    /* Called from AP timer interrupt — per-CPU accounting.
+     * Sleep/alarm queues and global tick are managed by BSP. */
+    uintptr_t flags = spin_lock_irqsave(&sched_lock);
+
+    if (current_process && current_process->state == PROCESS_RUNNING) {
+        current_process->utime++;
+
+        /* ITIMER_VIRTUAL: decremented each tick while RUNNING (no user-mode check here) */
+        if (current_process->itimer_virt_value > 0) {
+            current_process->itimer_virt_value--;
+            if (current_process->itimer_virt_value == 0) {
+                current_process->sig_pending_mask |= (1U << 26); /* SIGVTALRM */
+                current_process->itimer_virt_value = current_process->itimer_virt_interval;
+            }
+        }
+        /* ITIMER_PROF: decrement when running (user + kernel) */
+        if (current_process->itimer_prof_value > 0) {
+            current_process->itimer_prof_value--;
+            if (current_process->itimer_prof_value == 0) {
+                current_process->sig_pending_mask |= (1U << 27); /* SIGPROF */
+                current_process->itimer_prof_value = current_process->itimer_prof_interval;
+            }
+        }
+    }
+
+    spin_unlock_irqrestore(&sched_lock, flags);
+}
+
+void sched_load_balance(void) {
+    /* Periodic work stealing: called from BSP timer tick.
+     * Migrates one process from the busiest CPU to the idlest CPU
+     * if the load imbalance exceeds a threshold. */
+    uint32_t ncpus = sched_pcpu_count();
+    if (ncpus <= 1) return;
+
+    uint32_t max_cpu = 0, min_cpu = 0;
+    uint32_t max_load = 0, min_load = (uint32_t)-1;
+
+    for (uint32_t i = 0; i < ncpus && i < SCHED_MAX_CPUS; i++) {
+        uint32_t load = sched_pcpu_get_load(i);
+        if (load > max_load) { max_load = load; max_cpu = i; }
+        if (load < min_load) { min_load = load; min_cpu = i; }
+    }
+
+    /* Only migrate if imbalance >= 2 (avoids ping-pong) */
+    if (max_cpu == min_cpu || max_load < min_load + 2) return;
+
+    uintptr_t flags = spin_lock_irqsave(&sched_lock);
+
+    struct cpu_rq *src = &pcpu_rq[max_cpu];
+
+    /* Find a migratable process in the source's expired queue first,
+     * then active.  Skip idle processes and the currently running process. */
+    struct process* victim = NULL;
+    for (int pass = 0; pass < 2 && !victim; pass++) {
+        struct runqueue* rq = (pass == 0) ? src->expired : src->active;
+        if (!rq->bitmap) continue;
+        /* Try lowest-priority queue first (least disruptive) */
+        for (int prio = SCHED_NUM_PRIOS - 1; prio >= 0 && !victim; prio--) {
+            if (!(rq->bitmap & (1U << prio))) continue;
+            struct process* p = rq->queue[prio].head;
+            while (p) {
+                if (p != src->idle && p->state == PROCESS_READY) {
+                    victim = p;
+                    rq_dequeue(rq, p);
+                    break;
+                }
+                p = p->rq_next;
+            }
+        }
+    }
+
+    if (victim) {
+        victim->cpu_id = min_cpu;
+        rq_enqueue(pcpu_rq[min_cpu].active, victim);
+        sched_pcpu_dec_load(max_cpu);
+        sched_pcpu_inc_load(min_cpu);
+    }
+
+    spin_unlock_irqrestore(&sched_lock, flags);
+
+    if (victim) sched_ipi_resched(min_cpu);
 }
 
 uint32_t process_alarm_set(struct process* p, uint32_t tick) {
diff --git a/tests/smoke_test.exp b/tests/smoke_test.exp
index 9239254..4c7551c 100755
--- a/tests/smoke_test.exp
+++ b/tests/smoke_test.exp
@@ -143,6 +143,7 @@ set tests {
     {"mprotect"            "\\[init\\] mprotect OK"}
     {"getrlimit/setrlimit" "\\[init\\] getrlimit/setrlimit OK"}
     {"uname"               "\\[init\\] uname OK"}
+    {"SMP parallel fork"   "\\[init\\] SMP parallel fork OK"}
     {"LZ4 Frame decomp"    "\\[INITRD\\] LZ4"}
 }
 
diff --git a/user/fulltest.c b/user/fulltest.c
index 4b05955..4ee9f0f 100644
--- a/user/fulltest.c
+++ b/user/fulltest.c
@@ -4396,6 +4396,43 @@ void _start(void) {
         sys_write(1, "[init] uname OK\n", (uint32_t)(sizeof("[init] uname OK\n") - 1));
     }
 
+    // H1: SMP parallel fork test — exercises multi-CPU scheduling + load balancing
+    {
+        #define SMP_NCHILD 8
+        int smp_pids[SMP_NCHILD];
+        int smp_ok = 1;
+
+        for (int i = 0; i < SMP_NCHILD; i++) {
+            int pid = sys_fork();
+            if (pid == 0) {
+                /* Child: busy loop to consume a time slice, then exit with index */
+                volatile uint32_t sum = 0;
+                for (uint32_t j = 0; j < 50000; j++) sum += j;
+                (void)sum;
+                sys_exit(i + 1);
+            }
+            smp_pids[i] = pid;
+        }
+
+        /* Parent: wait for all children, verify each returned correct status */
+        for (int i = 0; i < SMP_NCHILD; i++) {
+            int st = 0;
+            int wp = sys_waitpid(smp_pids[i], &st, 0);
+            if (wp != smp_pids[i] || st != (i + 1)) {
+                smp_ok = 0;
+            }
+        }
+
+        if (smp_ok) {
+            static const char msg[] = "[init] SMP parallel fork OK\n";
+            (void)sys_write(1, msg, (uint32_t)(sizeof(msg) - 1));
+        } else {
+            static const char msg[] = "[init] SMP parallel fork FAIL\n";
+            (void)sys_write(1, msg, (uint32_t)(sizeof(msg) - 1));
+        }
+        #undef SMP_NCHILD
+    }
+
     (void)sys_write(1, "[init] execve(/bin/echo)\n",
                     (uint32_t)(sizeof("[init] execve(/bin/echo)\n") - 1));
     static const char* const argv[] = {"echo", "[echo]", "hello", "from", "echo", 0};