From: Tulio A M Mendes Date: Mon, 16 Feb 2026 21:04:47 +0000 (-0300) Subject: feat: SMP load balancing — per-CPU TSS, AP GDT reload, BSP-only timer work X-Git-Url: https://projects.tadryanom.me/sitemap.xml?a=commitdiff_plain;h=a4575de23d375a5fb5363a920afc118d4e20106a;p=AdrOS.git feat: SMP load balancing — per-CPU TSS, AP GDT reload, BSP-only timer work Five fixes enable kernel thread dispatch to any CPU: 1. Per-CPU TSS (gdt.c, gdt.h): Replace single TSS with tss_array[SMP_MAX_CPUS]. Each AP gets its own TSS via tss_init_ap() so ring 3→0 transitions use the correct per-task kernel stack on any CPU. 2. AP GDT virtual base reload (smp.c): The AP trampoline loads the GDT with a physical base for real→protected mode. After paging is active, reload the GDTR with the virtual base and flush all segment registers. Without this, ring transitions on APs read GDT entries from the identity-mapped physical address, causing silent failures for user-mode processes. 3. BSP-only timer work (timer.c): Gate tick increment, vdso update, vga_flush, hal_uart_poll_rx, and process_wake_check to run only on CPU 0. APs only call schedule(). Prevents non-atomic tick races, concurrent VGA/UART access, and duplicate wake processing. 4. Per-CPU SYSENTER stacks (sysenter_init.c): Each AP gets its own SYSENTER ESP MSR pointing to a dedicated stack. 5. Load balancing (scheduler.c): process_create_kernel dispatches to the least-loaded CPU via sched_pcpu_least_loaded(). All CPUs update their own TSS ESP0 during context switch. 83/83 smoke tests pass, cppcheck clean. 
--- diff --git a/include/arch/x86/gdt.h b/include/arch/x86/gdt.h index c87c22f..20dd18b 100644 --- a/include/arch/x86/gdt.h +++ b/include/arch/x86/gdt.h @@ -13,6 +13,7 @@ extern struct gdt_ptr gp; void gdt_init(void); void tss_set_kernel_stack(uintptr_t esp0); +void tss_init_ap(uint32_t cpu_index); void gdt_set_gate_ext(int num, uint32_t base, uint32_t limit, uint8_t access, uint8_t gran); #endif diff --git a/src/arch/x86/gdt.c b/src/arch/x86/gdt.c index 6d1153b..eddc5b9 100644 --- a/src/arch/x86/gdt.c +++ b/src/arch/x86/gdt.c @@ -1,4 +1,5 @@ #include "arch/x86/gdt.h" +#include "arch/x86/smp.h" #include "console.h" #include "utils.h" @@ -45,11 +46,14 @@ struct tss_entry { extern void gdt_flush(uint32_t gdt_ptr_addr); extern void tss_flush(uint16_t tss_selector); -/* 6 base entries + up to SMP_MAX_CPUS per-CPU GS segments */ -#define GDT_MAX_ENTRIES 24 +/* 6 base + 16 percpu GS + 1 user TLS + 16 per-CPU TSS = 39 max */ +#define GDT_MAX_ENTRIES 40 +/* AP TSS entries start at GDT slot 23 (after user TLS at 22) */ +#define TSS_AP_GDT_BASE 23 + static struct gdt_entry gdt[GDT_MAX_ENTRIES]; struct gdt_ptr gp; -static struct tss_entry tss; +static struct tss_entry tss_array[SMP_MAX_CPUS]; static void gdt_set_gate(int num, uint32_t base, uint32_t limit, uint8_t access, uint8_t gran) { gdt[num].base_low = (base & 0xFFFF); @@ -71,28 +75,46 @@ void gdt_set_gate_ext(int num, uint32_t base, uint32_t limit, uint8_t access, ui __asm__ volatile("lgdt %0" : : "m"(gp)); } -static void tss_write(uint32_t idx, uint16_t kernel_ss, uint32_t kernel_esp) { - uintptr_t base = (uintptr_t)&tss; - uint32_t limit = (uint32_t)(sizeof(tss) - 1); +static void tss_write(uint32_t gdt_idx, uint32_t cpu, uint16_t kernel_ss, uint32_t kernel_esp) { + struct tss_entry* t = &tss_array[cpu]; + uintptr_t base = (uintptr_t)t; + uint32_t limit = (uint32_t)(sizeof(*t) - 1); - gdt_set_gate((int)idx, (uint32_t)base, limit, 0x89, 0x00); + gdt_set_gate((int)gdt_idx, (uint32_t)base, limit, 0x89, 0x00); - for 
(size_t i = 0; i < sizeof(tss); i++) { - ((uint8_t*)&tss)[i] = 0; + for (size_t i = 0; i < sizeof(*t); i++) { + ((uint8_t*)t)[i] = 0; } - tss.ss0 = kernel_ss; - tss.esp0 = kernel_esp; - tss.iomap_base = (uint16_t)sizeof(tss); + t->ss0 = kernel_ss; + t->esp0 = kernel_esp; + t->iomap_base = (uint16_t)sizeof(*t); } extern void x86_sysenter_set_kernel_stack(uintptr_t esp0); void tss_set_kernel_stack(uintptr_t esp0) { - tss.esp0 = (uint32_t)esp0; + /* Determine which CPU we're on and update that CPU's TSS */ + extern uint32_t lapic_get_id(void); + extern int lapic_is_enabled(void); + uint32_t cpu = 0; + if (lapic_is_enabled()) { + extern uint32_t smp_current_cpu(void); + cpu = smp_current_cpu(); + } + if (cpu >= SMP_MAX_CPUS) cpu = 0; + tss_array[cpu].esp0 = (uint32_t)esp0; x86_sysenter_set_kernel_stack(esp0); } +void tss_init_ap(uint32_t cpu_index) { + if (cpu_index == 0 || cpu_index >= SMP_MAX_CPUS) return; + uint32_t gdt_idx = TSS_AP_GDT_BASE + (cpu_index - 1); + tss_write(gdt_idx, cpu_index, 0x10, 0); + uint16_t sel = (uint16_t)(gdt_idx * 8); + tss_flush(sel); +} + void gdt_init(void) { kprintf("[GDT] Initializing GDT/TSS...\n"); @@ -107,7 +129,7 @@ void gdt_init(void) { gdt_set_gate(3, 0, 0xFFFFFFFF, 0xFA, 0xCF); gdt_set_gate(4, 0, 0xFFFFFFFF, 0xF2, 0xCF); - tss_write(5, 0x10, 0); + tss_write(5, 0, 0x10, 0); gdt_flush((uint32_t)(uintptr_t)&gp); tss_flush(0x28); diff --git a/src/arch/x86/smp.c b/src/arch/x86/smp.c index e6db605..73abae6 100644 --- a/src/arch/x86/smp.c +++ b/src/arch/x86/smp.c @@ -56,6 +56,25 @@ static inline uint32_t read_cr3(void) { /* Called by each AP after it enters protected mode + paging. * This runs on the AP's own stack. */ void ap_entry(void) { + /* Reload the GDT with virtual base address. The trampoline loaded + * the GDT using a physical base for the real→protected mode transition. + * Now that paging is active we must switch to the virtual base so + * segment loads, LTR, and ring transitions read the correct GDT. 
*/ + extern struct gdt_ptr gp; + __asm__ volatile("lgdt %0" : : "m"(gp)); + + /* Reload segment registers with the virtual-base GDT */ + __asm__ volatile( + "mov $0x10, %%ax\n\t" + "mov %%ax, %%ds\n\t" + "mov %%ax, %%es\n\t" + "mov %%ax, %%fs\n\t" + "mov %%ax, %%ss\n\t" + "ljmp $0x08, $1f\n\t" + "1:\n\t" + ::: "eax", "memory" + ); + /* Load the IDT on this AP (BSP already initialized it, APs just need lidt) */ idt_load_ap(); @@ -85,6 +104,13 @@ void ap_entry(void) { } } + /* Set up per-CPU TSS so this AP can handle ring 0↔3 transitions */ + tss_init_ap(my_cpu); + + /* Set up SYSENTER MSRs on this AP (per-CPU MSRs + per-CPU stack) */ + extern void sysenter_init_ap(uint32_t cpu_index); + sysenter_init_ap(my_cpu); + /* Wait for BSP to finish scheduler init (process_init sets PID 0). * We check by waiting for the ap_sched_go flag set by the BSP after * timer_init completes. */ diff --git a/src/arch/x86/sysenter_init.c b/src/arch/x86/sysenter_init.c index f983731..088621b 100644 --- a/src/arch/x86/sysenter_init.c +++ b/src/arch/x86/sysenter_init.c @@ -1,6 +1,7 @@ #include "hal/cpu_features.h" #include "interrupts.h" #include "console.h" +#include "arch/x86/smp.h" #include @@ -23,11 +24,11 @@ static inline uint64_t rdmsr(uint32_t msr) { #define IA32_SYSENTER_ESP 0x175 #define IA32_SYSENTER_EIP 0x176 -/* Fixed kernel stack for SYSENTER entry — used only briefly before +/* Per-CPU kernel stacks for SYSENTER entry — used only briefly before * the handler switches to the per-task kernel stack via TSS.ESP0. - * For now, since we're single-core and the handler runs with IRQs - * disabled until it reads the real stack, this is safe. */ -static uint8_t sysenter_stack[4096] __attribute__((aligned(16))); + * Each CPU needs its own stack to avoid corruption when multiple CPUs + * enter SYSENTER simultaneously. 
*/ +static uint8_t sysenter_stacks[SMP_MAX_CPUS][4096] __attribute__((aligned(16))); static int sysenter_enabled = 0; static void x86_sysenter_init(void); @@ -52,8 +53,8 @@ static void x86_sysenter_init(void) { * Our GDT: 0x08=KernelCS, 0x10=KernelSS, 0x18=UserCS, 0x20=UserSS ✓ */ wrmsr(IA32_SYSENTER_CS, 0x08); - /* MSR 0x175: kernel ESP — top of our fixed sysenter stack */ - wrmsr(IA32_SYSENTER_ESP, (uintptr_t)&sysenter_stack[sizeof(sysenter_stack)]); + /* MSR 0x175: kernel ESP — top of BSP's sysenter stack */ + wrmsr(IA32_SYSENTER_ESP, (uintptr_t)&sysenter_stacks[0][4096]); /* MSR 0x176: kernel EIP — our assembly entry point */ wrmsr(IA32_SYSENTER_EIP, (uintptr_t)sysenter_entry); @@ -67,3 +68,11 @@ void x86_sysenter_set_kernel_stack(uintptr_t esp0) { wrmsr(IA32_SYSENTER_ESP, (uint64_t)esp0); } } + +void sysenter_init_ap(uint32_t cpu_index) { + if (!sysenter_enabled) return; + if (cpu_index >= SMP_MAX_CPUS) return; + wrmsr(IA32_SYSENTER_CS, 0x08); + wrmsr(IA32_SYSENTER_ESP, (uintptr_t)&sysenter_stacks[cpu_index][4096]); + wrmsr(IA32_SYSENTER_EIP, (uintptr_t)sysenter_entry); +} diff --git a/src/drivers/timer.c b/src/drivers/timer.c index a420bdd..1ee47a5 100644 --- a/src/drivers/timer.c +++ b/src/drivers/timer.c @@ -51,11 +51,23 @@ uint64_t clock_gettime_ns(void) { } static void hal_tick_bridge(void) { - tick++; - vdso_update_tick(tick); - vga_flush(); - hal_uart_poll_rx(); - process_wake_check(tick); +#ifdef __i386__ + extern uint32_t smp_current_cpu(void); + uint32_t cpu = smp_current_cpu(); +#else + uint32_t cpu = 0; +#endif + + if (cpu == 0) { + /* BSP: maintain tick counter, wake sleepers, flush display */ + tick++; + vdso_update_tick(tick); + vga_flush(); + hal_uart_poll_rx(); + process_wake_check(tick); + } + + /* All CPUs: run the scheduler to pick up new work */ schedule(); } diff --git a/src/kernel/scheduler.c b/src/kernel/scheduler.c index 6df95c0..b5197f9 100644 --- a/src/kernel/scheduler.c +++ b/src/kernel/scheduler.c @@ -937,7 +937,8 @@ struct 
process* process_create_kernel(void (*entry_point)(void)) { proc->flags = 0; proc->tls_base = 0; proc->clear_child_tid = NULL; - proc->cpu_id = 0; + uint32_t target = sched_pcpu_least_loaded(); + proc->cpu_id = target; arch_fpu_init_state(proc->fpu_state); @@ -966,10 +967,11 @@ struct process* process_create_kernel(void (*entry_point)(void)) { ready_queue_head->prev = proc; ready_queue_tail = proc; - rq_enqueue(pcpu_rq[0].active, proc); - sched_pcpu_inc_load(0); + rq_enqueue(pcpu_rq[target].active, proc); + sched_pcpu_inc_load(target); spin_unlock_irqrestore(&sched_lock, flags); + /* IPI disabled for testing */ return proc; } @@ -1057,9 +1059,9 @@ void schedule(void) { hal_cpu_set_address_space(current_process->addr_space); } - /* Only update TSS kernel stack on CPU 0 — the TSS is shared and - * only the BSP runs user processes that need ring 0 stack in TSS. */ - if (cpu == 0 && current_process->kernel_stack) { + /* Update this CPU's TSS kernel stack so ring 3→0 transitions + * use the correct per-task kernel stack. Each CPU has its own TSS. */ + if (current_process->kernel_stack) { hal_cpu_set_kernel_stack((uintptr_t)current_process->kernel_stack + KSTACK_SIZE); }