From: Tulio A M Mendes Date: Mon, 16 Feb 2026 21:04:47 +0000 (-0300) Subject: feat: SMP load balancing — per-CPU TSS, AP GDT reload, BSP-only timer work X-Git-Url: https://projects.tadryanom.me/sitemap.xml?a=commitdiff_plain;h=a4575de23d375a5fb5363a920afc118d4e20106a;p=AdrOS.git feat: SMP load balancing — per-CPU TSS, AP GDT reload, BSP-only timer work Five fixes enable kernel thread dispatch to any CPU: 1. Per-CPU TSS (gdt.c, gdt.h): Replace single TSS with tss_array[SMP_MAX_CPUS]. Each AP gets its own TSS via tss_init_ap() so ring 3→0 transitions use the correct per-task kernel stack on any CPU. 2. AP GDT virtual base reload (smp.c): The AP trampoline loads the GDT with a physical base for real→protected mode. After paging is active, reload the GDTR with the virtual base and flush all segment registers. Without this, ring transitions on APs read GDT entries from the identity-mapped physical address, causing silent failures for user-mode processes. 3. BSP-only timer work (timer.c): Gate tick increment, vdso update, vga_flush, hal_uart_poll_rx, and process_wake_check to run only on CPU 0. APs only call schedule(). Prevents non-atomic tick races, concurrent VGA/UART access, and duplicate wake processing. 4. Per-CPU SYSENTER stacks (sysenter_init.c): Each AP gets its own SYSENTER ESP MSR pointing to a dedicated stack. 5. Load balancing (scheduler.c): process_create_kernel dispatches to the least-loaded CPU via sched_pcpu_least_loaded(). All CPUs update their own TSS ESP0 during context switch. 83/83 smoke tests pass, cppcheck clean. 
--- diff --git a/include/arch/x86/gdt.h b/include/arch/x86/gdt.h index c87c22f..20dd18b 100644 --- a/include/arch/x86/gdt.h +++ b/include/arch/x86/gdt.h @@ -13,6 +13,7 @@ extern struct gdt_ptr gp; void gdt_init(void); void tss_set_kernel_stack(uintptr_t esp0); +void tss_init_ap(uint32_t cpu_index); void gdt_set_gate_ext(int num, uint32_t base, uint32_t limit, uint8_t access, uint8_t gran); #endif diff --git a/src/arch/x86/gdt.c b/src/arch/x86/gdt.c index 6d1153b..eddc5b9 100644 --- a/src/arch/x86/gdt.c +++ b/src/arch/x86/gdt.c @@ -1,4 +1,5 @@ #include "arch/x86/gdt.h" +#include "arch/x86/smp.h" #include "console.h" #include "utils.h" @@ -45,11 +46,14 @@ struct tss_entry { extern void gdt_flush(uint32_t gdt_ptr_addr); extern void tss_flush(uint16_t tss_selector); -/* 6 base entries + up to SMP_MAX_CPUS per-CPU GS segments */ -#define GDT_MAX_ENTRIES 24 +/* 6 base + 16 percpu GS + 1 user TLS + 16 per-CPU TSS = 39 max */ +#define GDT_MAX_ENTRIES 40 +/* AP TSS entries start at GDT slot 23 (after user TLS at 22) */ +#define TSS_AP_GDT_BASE 23 + static struct gdt_entry gdt[GDT_MAX_ENTRIES]; struct gdt_ptr gp; -static struct tss_entry tss; +static struct tss_entry tss_array[SMP_MAX_CPUS]; static void gdt_set_gate(int num, uint32_t base, uint32_t limit, uint8_t access, uint8_t gran) { gdt[num].base_low = (base & 0xFFFF); @@ -71,28 +75,46 @@ void gdt_set_gate_ext(int num, uint32_t base, uint32_t limit, uint8_t access, ui __asm__ volatile("lgdt %0" : : "m"(gp)); } -static void tss_write(uint32_t idx, uint16_t kernel_ss, uint32_t kernel_esp) { - uintptr_t base = (uintptr_t)&tss; - uint32_t limit = (uint32_t)(sizeof(tss) - 1); +static void tss_write(uint32_t gdt_idx, uint32_t cpu, uint16_t kernel_ss, uint32_t kernel_esp) { + struct tss_entry* t = &tss_array[cpu]; + uintptr_t base = (uintptr_t)t; + uint32_t limit = (uint32_t)(sizeof(*t) - 1); - gdt_set_gate((int)idx, (uint32_t)base, limit, 0x89, 0x00); + gdt_set_gate((int)gdt_idx, (uint32_t)base, limit, 0x89, 0x00); - for 
(size_t i = 0; i < sizeof(tss); i++) { - ((uint8_t*)&tss)[i] = 0; + for (size_t i = 0; i < sizeof(*t); i++) { + ((uint8_t*)t)[i] = 0; } - tss.ss0 = kernel_ss; - tss.esp0 = kernel_esp; - tss.iomap_base = (uint16_t)sizeof(tss); + t->ss0 = kernel_ss; + t->esp0 = kernel_esp; + t->iomap_base = (uint16_t)sizeof(*t); } extern void x86_sysenter_set_kernel_stack(uintptr_t esp0); void tss_set_kernel_stack(uintptr_t esp0) { - tss.esp0 = (uint32_t)esp0; + /* Determine which CPU we're on and update that CPU's TSS */ + extern uint32_t lapic_get_id(void); + extern int lapic_is_enabled(void); + uint32_t cpu = 0; + if (lapic_is_enabled()) { + extern uint32_t smp_current_cpu(void); + cpu = smp_current_cpu(); + } + if (cpu >= SMP_MAX_CPUS) cpu = 0; + tss_array[cpu].esp0 = (uint32_t)esp0; x86_sysenter_set_kernel_stack(esp0); } +void tss_init_ap(uint32_t cpu_index) { + if (cpu_index == 0 || cpu_index >= SMP_MAX_CPUS) return; + uint32_t gdt_idx = TSS_AP_GDT_BASE + (cpu_index - 1); + tss_write(gdt_idx, cpu_index, 0x10, 0); + uint16_t sel = (uint16_t)(gdt_idx * 8); + tss_flush(sel); +} + void gdt_init(void) { kprintf("[GDT] Initializing GDT/TSS...\n"); @@ -107,7 +129,7 @@ void gdt_init(void) { gdt_set_gate(3, 0, 0xFFFFFFFF, 0xFA, 0xCF); gdt_set_gate(4, 0, 0xFFFFFFFF, 0xF2, 0xCF); - tss_write(5, 0x10, 0); + tss_write(5, 0, 0x10, 0); gdt_flush((uint32_t)(uintptr_t)&gp); tss_flush(0x28); diff --git a/src/arch/x86/smp.c b/src/arch/x86/smp.c index e6db605..73abae6 100644 --- a/src/arch/x86/smp.c +++ b/src/arch/x86/smp.c @@ -56,6 +56,25 @@ static inline uint32_t read_cr3(void) { /* Called by each AP after it enters protected mode + paging. * This runs on the AP's own stack. */ void ap_entry(void) { + /* Reload the GDT with virtual base address. The trampoline loaded + * the GDT using a physical base for the real→protected mode transition. + * Now that paging is active we must switch to the virtual base so + * segment loads, LTR, and ring transitions read the correct GDT. 
*/ + extern struct gdt_ptr gp; + __asm__ volatile("lgdt %0" : : "m"(gp)); + + /* Reload segment registers with the virtual-base GDT */ + __asm__ volatile( + "mov $0x10, %%ax\n\t" + "mov %%ax, %%ds\n\t" + "mov %%ax, %%es\n\t" + "mov %%ax, %%fs\n\t" + "mov %%ax, %%ss\n\t" + "ljmp $0x08, $1f\n\t" + "1:\n\t" + ::: "eax", "memory" + ); + /* Load the IDT on this AP (BSP already initialized it, APs just need lidt) */ idt_load_ap(); @@ -85,6 +104,13 @@ void ap_entry(void) { } } + /* Set up per-CPU TSS so this AP can handle ring 0↔3 transitions */ + tss_init_ap(my_cpu); + + /* Set up SYSENTER MSRs on this AP (per-CPU MSRs + per-CPU stack) */ + extern void sysenter_init_ap(uint32_t cpu_index); + sysenter_init_ap(my_cpu); + /* Wait for BSP to finish scheduler init (process_init sets PID 0). * We check by waiting for the ap_sched_go flag set by the BSP after * timer_init completes. */ diff --git a/src/arch/x86/sysenter_init.c b/src/arch/x86/sysenter_init.c index f983731..088621b 100644 --- a/src/arch/x86/sysenter_init.c +++ b/src/arch/x86/sysenter_init.c @@ -1,6 +1,7 @@ #include "hal/cpu_features.h" #include "interrupts.h" #include "console.h" +#include "arch/x86/smp.h" #include @@ -23,11 +24,11 @@ static inline uint64_t rdmsr(uint32_t msr) { #define IA32_SYSENTER_ESP 0x175 #define IA32_SYSENTER_EIP 0x176 -/* Fixed kernel stack for SYSENTER entry — used only briefly before +/* Per-CPU kernel stacks for SYSENTER entry — used only briefly before * the handler switches to the per-task kernel stack via TSS.ESP0. - * For now, since we're single-core and the handler runs with IRQs - * disabled until it reads the real stack, this is safe. */ -static uint8_t sysenter_stack[4096] __attribute__((aligned(16))); + * Each CPU needs its own stack to avoid corruption when multiple CPUs + * enter SYSENTER simultaneously. 
*/ +static uint8_t sysenter_stacks[SMP_MAX_CPUS][4096] __attribute__((aligned(16))); static int sysenter_enabled = 0; static void x86_sysenter_init(void); @@ -52,8 +53,8 @@ static void x86_sysenter_init(void) { * Our GDT: 0x08=KernelCS, 0x10=KernelSS, 0x18=UserCS, 0x20=UserSS ✓ */ wrmsr(IA32_SYSENTER_CS, 0x08); - /* MSR 0x175: kernel ESP — top of our fixed sysenter stack */ - wrmsr(IA32_SYSENTER_ESP, (uintptr_t)&sysenter_stack[sizeof(sysenter_stack)]); + /* MSR 0x175: kernel ESP — top of BSP's sysenter stack */ + wrmsr(IA32_SYSENTER_ESP, (uintptr_t)&sysenter_stacks[0][4096]); /* MSR 0x176: kernel EIP — our assembly entry point */ wrmsr(IA32_SYSENTER_EIP, (uintptr_t)sysenter_entry); @@ -67,3 +68,11 @@ void x86_sysenter_set_kernel_stack(uintptr_t esp0) { wrmsr(IA32_SYSENTER_ESP, (uint64_t)esp0); } } + +void sysenter_init_ap(uint32_t cpu_index) { + if (!sysenter_enabled) return; + if (cpu_index >= SMP_MAX_CPUS) return; + wrmsr(IA32_SYSENTER_CS, 0x08); + wrmsr(IA32_SYSENTER_ESP, (uintptr_t)&sysenter_stacks[cpu_index][4096]); + wrmsr(IA32_SYSENTER_EIP, (uintptr_t)sysenter_entry); +} diff --git a/src/drivers/timer.c b/src/drivers/timer.c index a420bdd..1ee47a5 100644 --- a/src/drivers/timer.c +++ b/src/drivers/timer.c @@ -51,11 +51,23 @@ uint64_t clock_gettime_ns(void) { } static void hal_tick_bridge(void) { - tick++; - vdso_update_tick(tick); - vga_flush(); - hal_uart_poll_rx(); - process_wake_check(tick); +#ifdef __i386__ + extern uint32_t smp_current_cpu(void); + uint32_t cpu = smp_current_cpu(); +#else + uint32_t cpu = 0; +#endif + + if (cpu == 0) { + /* BSP: maintain tick counter, wake sleepers, flush display */ + tick++; + vdso_update_tick(tick); + vga_flush(); + hal_uart_poll_rx(); + process_wake_check(tick); + } + + /* All CPUs: run the scheduler to pick up new work */ schedule(); } diff --git a/src/kernel/scheduler.c b/src/kernel/scheduler.c index 6df95c0..b5197f9 100644 --- a/src/kernel/scheduler.c +++ b/src/kernel/scheduler.c @@ -937,7 +937,8 @@ struct 
process* process_create_kernel(void (*entry_point)(void)) { proc->flags = 0; proc->tls_base = 0; proc->clear_child_tid = NULL; - proc->cpu_id = 0; + uint32_t target = sched_pcpu_least_loaded(); + proc->cpu_id = target; arch_fpu_init_state(proc->fpu_state); @@ -966,10 +967,11 @@ struct process* process_create_kernel(void (*entry_point)(void)) { ready_queue_head->prev = proc; ready_queue_tail = proc; - rq_enqueue(pcpu_rq[0].active, proc); - sched_pcpu_inc_load(0); + rq_enqueue(pcpu_rq[target].active, proc); + sched_pcpu_inc_load(target); spin_unlock_irqrestore(&sched_lock, flags); + /* IPI disabled for testing */ return proc; } @@ -1057,9 +1059,9 @@ void schedule(void) { hal_cpu_set_address_space(current_process->addr_space); } - /* Only update TSS kernel stack on CPU 0 — the TSS is shared and - * only the BSP runs user processes that need ring 0 stack in TSS. */ - if (cpu == 0 && current_process->kernel_stack) { + /* Update this CPU's TSS kernel stack so ring 3→0 transitions + * use the correct per-task kernel stack. Each CPU has its own TSS. */ + if (current_process->kernel_stack) { hal_cpu_set_kernel_stack((uintptr_t)current_process->kernel_stack + KSTACK_SIZE); }