feat: full SMP scheduling — AP tick accounting, IPI wakeups, load balancing

author Tulio A M Mendes <[email protected]>

Sat, 14 Mar 2026 13:23:54 +0000 (10:23 -0300)

committer Tulio A M Mendes <[email protected]>

Sat, 14 Mar 2026 13:23:54 +0000 (10:23 -0300)
author Tulio A M Mendes <[email protected]>
Sat, 14 Mar 2026 13:23:54 +0000 (10:23 -0300)
committer Tulio A M Mendes <[email protected]>
Sat, 14 Mar 2026 13:23:54 +0000 (10:23 -0300)
diff --git a/include/process.h b/include/process.h

index c977d7d8bf4b65cfc7fdce6008dcda7cb454328c..e22f3d11ed46d9fa23c676bbca918ee26a2dcfd1 100644 (file)
--- a/include/process.h
+++ b/include/process.h
@@ -218,4 +218,10 @@ struct process* process_find_by_pid(uint32_t pid);
  // Skips insertion if the process is no longer PROCESS_SLEEPING.
  void sched_sleep_enqueue_self(void);
  
+// Per-CPU tick accounting for AP cores (utime, itimers).
+void sched_ap_tick(void);
+
+// Periodic load balancing: migrate one process from busiest to idlest CPU.
+void sched_load_balance(void);
+
  #endif
diff --git a/src/drivers/timer.c b/src/drivers/timer.c

index 1ee47a53dcb58364a462e7d435e51931d7559908..274e3135c4717ed1b33a871661833fb0a74d93f4 100644 (file)
--- a/src/drivers/timer.c
+++ b/src/drivers/timer.c
@@ -7,6 +7,10 @@
  #include "hal/timer.h"
  #include "hal/uart.h"
  
+#ifdef __i386__
+#include "arch/x86/percpu.h"
+#endif
+
  static uint32_t tick = 0;
  
  /* TSC-based nanosecond timekeeping */
@@ -50,10 +54,12 @@ uint64_t clock_gettime_ns(void) {
      return sec_part + frac_part;
  }
  
+static uint32_t lb_counter = 0;
+#define LOAD_BALANCE_INTERVAL 10  /* every 10 ticks (~200ms at 50Hz) */
+
  static void hal_tick_bridge(void) {
  #ifdef __i386__
-    extern uint32_t smp_current_cpu(void);
-    uint32_t cpu = smp_current_cpu();
+    uint32_t cpu = percpu_cpu_index();
  #else
      uint32_t cpu = 0;
  #endif
@@ -65,6 +71,15 @@ static void hal_tick_bridge(void) {
          vga_flush();
          hal_uart_poll_rx();
          process_wake_check(tick);
+
+        /* Periodic load balancing */
+        if (++lb_counter >= LOAD_BALANCE_INTERVAL) {
+            lb_counter = 0;
+            sched_load_balance();
+        }
+    } else {
+        /* AP: per-CPU tick accounting (utime, itimers) */
+        sched_ap_tick();
      }
  
      /* All CPUs: run the scheduler to pick up new work */
diff --git a/src/hal/x86/timer.c b/src/hal/x86/timer.c

index 6870a69d8126ddd18809db645c4d63a3f0eb1660..f1229a671869a9e0bcb386f767c657459ae96672 100644 (file)
--- a/src/hal/x86/timer.c
+++ b/src/hal/x86/timer.c
@@ -11,13 +11,10 @@ static hal_timer_tick_cb_t g_tick_cb = 0;
  
  static void timer_irq(struct registers* regs) {
      (void)regs;
-    if (lapic_is_enabled() && lapic_get_id() != 0) {
-        /* AP: only run the local scheduler — tick accounting, VGA flush,
-         * UART poll, and sleep-queue wake are handled by the BSP. */
-        extern void schedule(void);
-        schedule();
-        return;
-    }
+    /* All CPUs (BSP and APs) go through the tick bridge.
+     * The bridge dispatches BSP-only work (tick counter, sleep wakeups,
+     * VGA flush, UART poll, load balancing) and AP-only work (sched_ap_tick),
+     * then calls schedule() on all CPUs. */
      if (g_tick_cb) g_tick_cb();
  }
  
diff --git a/src/kernel/scheduler.c b/src/kernel/scheduler.c

index d03a75506c842c91f4141401b6b6578e1916953e..f2690a991db9be01eb7b9052c532fc8e6660013b 100644 (file)
--- a/src/kernel/scheduler.c
+++ b/src/kernel/scheduler.c
@@ -538,6 +538,7 @@ void process_exit_notify(int status) {
          }
      }
  
+    uint32_t wake_cpu = (uint32_t)-1;
      if (current_process->pid != 0) {
          struct process* parent = process_find_locked(current_process->parent_pid);
          if (parent && parent->state == PROCESS_BLOCKED && parent->waiting) {
@@ -545,12 +546,16 @@ void process_exit_notify(int status) {
                  parent->wait_result_pid = (int)current_process->pid;
                  parent->wait_result_status = status;
                  parent->state = PROCESS_READY;
-                rq_enqueue(pcpu_rq[parent->cpu_id].active, parent);
+                uint32_t pcpu = parent->cpu_id < SCHED_MAX_CPUS ? parent->cpu_id : 0;
+                rq_enqueue(pcpu_rq[pcpu].active, parent);
+                sched_pcpu_inc_load(pcpu);
+                wake_cpu = pcpu;
              }
          }
      }
  
      spin_unlock_irqrestore(&sched_lock, flags);
+    if (wake_cpu != (uint32_t)-1) sched_ipi_resched(wake_cpu);
  }
  
  static void fork_child_trampoline(void) {
@@ -1188,14 +1193,19 @@ void process_wake_check(uint32_t current_tick) {
          current_process->utime++;
      }
  
-    /* O(1) sleep queue: pop expired entries from the sorted head */
+    /* O(1) sleep queue: pop expired entries from the sorted head.
+     * Track which remote CPUs need an IPI to pick up newly-ready work. */
+    uint32_t ipi_mask = 0;  /* bitmask of CPUs needing IPI */
      while (sleep_head && current_tick >= sleep_head->wake_at_tick) {
          struct process* p = sleep_head;
          sleep_queue_remove(p);
          if (p->state == PROCESS_SLEEPING) {
              p->state = PROCESS_READY;
              if (p->priority > 0) p->priority--;
-            rq_enqueue(pcpu_rq[p->cpu_id].active, p);
+            uint32_t tcpu = p->cpu_id < SCHED_MAX_CPUS ? p->cpu_id : 0;
+            rq_enqueue(pcpu_rq[tcpu].active, p);
+            sched_pcpu_inc_load(tcpu);
+            if (tcpu < 32) ipi_mask |= (1U << tcpu);
          }
      }
  
@@ -1233,6 +1243,100 @@ void process_wake_check(uint32_t current_tick) {
      }
  
      spin_unlock_irqrestore(&sched_lock, flags);
+
+    /* Send IPI to remote CPUs that received newly-ready processes */
+    uint32_t my_cpu = percpu_cpu_index();
+    ipi_mask &= ~(1U << my_cpu);  /* no self-IPI */
+    while (ipi_mask) {
+        uint32_t c = (uint32_t)__builtin_ctz(ipi_mask);
+        sched_ipi_resched(c);
+        ipi_mask &= ~(1U << c);
+    }
+}
+
+void sched_ap_tick(void) {
+    /* Called from AP timer interrupt — charges one tick to current_process under sched_lock.
+     * Sleep/alarm queues and global tick are managed by BSP. */
+    uintptr_t flags = spin_lock_irqsave(&sched_lock);
+
+    if (current_process && current_process->state == PROCESS_RUNNING) {
+        current_process->utime++;
+
+        /* ITIMER_VIRTUAL: NOTE(review): no user-mode check here; decrements on every running tick */
+        if (current_process->itimer_virt_value > 0) {
+            current_process->itimer_virt_value--;
+            if (current_process->itimer_virt_value == 0) {
+                current_process->sig_pending_mask |= (1U << 26); /* SIGVTALRM */
+                current_process->itimer_virt_value = current_process->itimer_virt_interval; /* re-arm; a zero interval leaves it disarmed */
+            }
+        }
+        /* ITIMER_PROF: decrement when running (user + kernel) */
+        if (current_process->itimer_prof_value > 0) {
+            current_process->itimer_prof_value--;
+            if (current_process->itimer_prof_value == 0) {
+                current_process->sig_pending_mask |= (1U << 27); /* SIGPROF */
+                current_process->itimer_prof_value = current_process->itimer_prof_interval; /* re-arm; a zero interval leaves it disarmed */
+            }
+        }
+    }
+
+    spin_unlock_irqrestore(&sched_lock, flags);
+}
+
+void sched_load_balance(void) {
+    /* Periodic load balancing, called from the BSP timer tick. Moves at most
+     * one READY process from the busiest CPU's runqueues to the least-loaded
+     * CPU's active queue when the load gap is at least 2, then IPIs that CPU. */
+    uint32_t ncpus = sched_pcpu_count();
+    if (ncpus <= 1) return;
+
+    uint32_t max_cpu = 0, min_cpu = 0;
+    uint32_t max_load = 0, min_load = (uint32_t)-1;
+
+    for (uint32_t i = 0; i < ncpus && i < SCHED_MAX_CPUS; i++) {
+        uint32_t load = sched_pcpu_get_load(i);
+        if (load > max_load) { max_load = load; max_cpu = i; }
+        if (load < min_load) { min_load = load; min_cpu = i; }
+    }
+
+    /* Only migrate if imbalance >= 2 (avoids ping-pong). Loads are sampled before sched_lock is taken. */
+    if (max_cpu == min_cpu || max_load < min_load + 2) return;
+
+    uintptr_t flags = spin_lock_irqsave(&sched_lock);
+
+    struct cpu_rq *src = &pcpu_rq[max_cpu];
+
+    /* Find a migratable process in the source's expired queue first,
+     * then active.  Skip the idle process and anything not PROCESS_READY. */
+    struct process* victim = NULL;
+    for (int pass = 0; pass < 2 && !victim; pass++) {
+        struct runqueue* rq = (pass == 0) ? src->expired : src->active;
+        if (!rq->bitmap) continue;
+        /* Try lowest-priority queue first (least disruptive) */
+        for (int prio = SCHED_NUM_PRIOS - 1; prio >= 0 && !victim; prio--) {
+            if (!(rq->bitmap & (1U << prio))) continue;
+            struct process* p = rq->queue[prio].head;
+            while (p) {
+                if (p != src->idle && p->state == PROCESS_READY) {
+                    victim = p;
+                    rq_dequeue(rq, p);
+                    break;
+                }
+                p = p->rq_next;
+            }
+        }
+    }
+
+    if (victim) {
+        victim->cpu_id = min_cpu;
+        rq_enqueue(pcpu_rq[min_cpu].active, victim);
+        sched_pcpu_dec_load(max_cpu);
+        sched_pcpu_inc_load(min_cpu);
+    }
+
+    spin_unlock_irqrestore(&sched_lock, flags);
+
+    if (victim) sched_ipi_resched(min_cpu);
  }
  
  uint32_t process_alarm_set(struct process* p, uint32_t tick) {
diff --git a/tests/smoke_test.exp b/tests/smoke_test.exp

index 9239254fa9a074ab3f619be0e798087a3b18ccf4..4c7551c42504fe3e65b9ceb32887510e0ad37a17 100755 (executable)
--- a/tests/smoke_test.exp
+++ b/tests/smoke_test.exp
@@ -143,6 +143,7 @@ set tests {
      {"mprotect"            "\\[init\\] mprotect OK"}
      {"getrlimit/setrlimit" "\\[init\\] getrlimit/setrlimit OK"}
      {"uname"               "\\[init\\] uname OK"}
+    {"SMP parallel fork"   "\\[init\\] SMP parallel fork OK"}
      {"LZ4 Frame decomp"    "\\[INITRD\\] LZ4"}
  }
  
diff --git a/user/fulltest.c b/user/fulltest.c

index 4b059558c456ecaeaa3497f2363bf8a09da4605e..4ee9f0fe7717ddd7b7a8258bb3a40613e8d8a986 100644 (file)
--- a/user/fulltest.c
+++ b/user/fulltest.c
@@ -4396,6 +4396,43 @@ void _start(void) {
          sys_write(1, "[init] uname OK\n", (uint32_t)(sizeof("[init] uname OK\n") - 1));
      }
  
+    // H1: SMP parallel fork test — exercises multi-CPU scheduling + load balancing
+    {
+        #define SMP_NCHILD 8
+        int smp_pids[SMP_NCHILD];
+        int smp_ok = 1;
+
+        for (int i = 0; i < SMP_NCHILD; i++) {
+            int pid = sys_fork();
+            if (pid == 0) {
+                /* Child: busy loop to consume a time slice, then exit with index + 1 */
+                volatile uint32_t sum = 0;
+                for (uint32_t j = 0; j < 50000; j++) sum += j;
+                (void)sum;
+                sys_exit(i + 1);
+            }
+            smp_pids[i] = pid;
+        }
+
+        /* Parent: wait for all children, verify each returned correct status */
+        for (int i = 0; i < SMP_NCHILD; i++) {
+            int st = 0;
+            int wp = sys_waitpid(smp_pids[i], &st, 0);
+            if (wp != smp_pids[i] || st != (i + 1)) {
+                smp_ok = 0;
+            }
+        }
+
+        if (smp_ok) {
+            static const char msg[] = "[init] SMP parallel fork OK\n";
+            (void)sys_write(1, msg, (uint32_t)(sizeof(msg) - 1));
+        } else {
+            static const char msg[] = "[init] SMP parallel fork FAIL\n";
+            (void)sys_write(1, msg, (uint32_t)(sizeof(msg) - 1));
+        }
+        #undef SMP_NCHILD
+    }
+
      (void)sys_write(1, "[init] execve(/bin/echo)\n",
                      (uint32_t)(sizeof("[init] execve(/bin/echo)\n") - 1));
      static const char* const argv[] = {"echo", "[echo]", "hello", "from", "echo", 0};
author	Tulio A M Mendes <[email protected]>
	Sat, 14 Mar 2026 13:23:54 +0000 (10:23 -0300)
committer	Tulio A M Mendes <[email protected]>
	Sat, 14 Mar 2026 13:23:54 +0000 (10:23 -0300)
include/process.h		patch \| blob \| blame \| history
src/drivers/timer.c		patch \| blob \| blame \| history
src/hal/x86/timer.c		patch \| blob \| blame \| history
src/kernel/scheduler.c		patch \| blob \| blame \| history
tests/smoke_test.exp		patch \| blob \| blame \| history
user/fulltest.c		patch \| blob \| blame \| history