From: Tulio A M Mendes Date: Mon, 16 Feb 2026 00:45:17 +0000 (-0300) Subject: feat: FPU/SSE context save/restore for correct floating-point across context switches X-Git-Url: https://projects.tadryanom.me/docs/POSIX_ROADMAP.md?a=commitdiff_plain;h=cf204443c15487936b43da28f25986492dcf05e3;p=AdrOS.git feat: FPU/SSE context save/restore for correct floating-point across context switches - arch_fpu_init(): initialize x87 FPU (CR0.NE, clear EM/TS), enable OSFXSR if FXSR supported - arch_fpu_save/restore: FXSAVE/FXRSTOR (or FSAVE/FRSTOR fallback) per process - FPU state (512B) added to struct process, initialized for new processes - fork/clone inherit parent FPU state; kernel threads get clean state - schedule() saves prev FPU state before context_switch, restores next after - Heap header padded 8->16 bytes for 16-byte aligned kmalloc (FXSAVE requirement) - Added -mno-sse -mno-mmx to kernel ARCH_CFLAGS (prevent SSE in kernel code) - Weak stubs in src/kernel/fpu.c for non-x86 architectures --- diff --git a/Makefile b/Makefile index 83a6f1b..81ec674 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,7 @@ ifeq ($(ARCH),x86) C_SOURCES += $(NET_SOURCES) # Mandatory Architecture Flags - ARCH_CFLAGS := -m32 -ffreestanding -fno-builtin -U_FORTIFY_SOURCE -Iinclude -Iinclude/net -Ithird_party/lwip/src/include + ARCH_CFLAGS := -m32 -ffreestanding -fno-builtin -U_FORTIFY_SOURCE -mno-sse -mno-mmx -Iinclude -Iinclude/net -Ithird_party/lwip/src/include ARCH_LDFLAGS := -m elf_i386 -T $(SRC_DIR)/arch/x86/linker.ld ARCH_ASFLAGS := --32 diff --git a/include/arch_fpu.h b/include/arch_fpu.h new file mode 100644 index 0000000..94e28e6 --- /dev/null +++ b/include/arch_fpu.h @@ -0,0 +1,28 @@ +#ifndef ARCH_FPU_H +#define ARCH_FPU_H + +#include +#include + +/* + * FPU/SSE context save area size. + * FXSAVE requires 512 bytes, 16-byte aligned. + * FSAVE requires 108 bytes (no alignment requirement). + * We always allocate the larger size for simplicity. 
+ */ +#define FPU_STATE_SIZE 512 +#define FPU_STATE_ALIGN 16 + +/* Initialize FPU hardware during boot (CR0/CR4 bits, FNINIT). */ +void arch_fpu_init(void); + +/* Save current FPU/SSE state into buffer (must be 16-byte aligned). */ +void arch_fpu_save(uint8_t* state); + +/* Restore FPU/SSE state from buffer (must be 16-byte aligned). */ +void arch_fpu_restore(const uint8_t* state); + +/* Copy the clean (post-FNINIT) FPU state into buffer for new processes. */ +void arch_fpu_init_state(uint8_t* state); + +#endif diff --git a/include/process.h b/include/process.h index cc2c046..dccc4ff 100644 --- a/include/process.h +++ b/include/process.h @@ -3,6 +3,7 @@ #include #include "arch_types.h" +#include "arch_fpu.h" #include "fs.h" #include "signal.h" @@ -128,6 +129,8 @@ struct process { uint32_t flags; /* PROCESS_FLAG_* */ uintptr_t tls_base; /* User-space TLS base (set via SET_THREAD_AREA) */ uint32_t* clear_child_tid; /* User address to clear + futex-wake on exit */ + + uint8_t fpu_state[FPU_STATE_SIZE] __attribute__((aligned(FPU_STATE_ALIGN))); }; // Global pointer to the currently running process diff --git a/src/arch/x86/fpu.c b/src/arch/x86/fpu.c new file mode 100644 index 0000000..0d8ee55 --- /dev/null +++ b/src/arch/x86/fpu.c @@ -0,0 +1,93 @@ +#include "arch_fpu.h" +#include "console.h" +#include "hal/cpu_features.h" + +#include +#include + +/* CR0 bits */ +#define CR0_EM (1U << 2) /* Emulate coprocessor (must be CLEAR for real FPU) */ +#define CR0_TS (1U << 3) /* Task Switched (lazy FPU — we clear it) */ +#define CR0_NE (1U << 5) /* Numeric Error (use native FPU exceptions) */ +#define CR0_MP (1U << 1) /* Monitor coprocessor */ + +/* CR4 bits */ +#define CR4_OSFXSR (1U << 9) /* OS supports FXSAVE/FXRSTOR */ +#define CR4_OSXMMEXCPT (1U << 10) /* OS supports SSE exceptions */ + +static int g_fpu_has_fxsr = 0; + +/* Clean FPU state captured right after FNINIT — used as template for new processes */ +static uint8_t g_fpu_clean_state[FPU_STATE_SIZE] 
__attribute__((aligned(FPU_STATE_ALIGN))); + +static inline uint32_t read_cr0(void) { + uint32_t val; + __asm__ volatile("mov %%cr0, %0" : "=r"(val)); + return val; +} + +static inline void write_cr0(uint32_t val) { + __asm__ volatile("mov %0, %%cr0" :: "r"(val) : "memory"); +} + +static inline uint32_t read_cr4(void) { + uint32_t val; + __asm__ volatile("mov %%cr4, %0" : "=r"(val)); + return val; +} + +static inline void write_cr4(uint32_t val) { + __asm__ volatile("mov %0, %%cr4" :: "r"(val) : "memory"); +} + +void arch_fpu_init(void) { + const struct cpu_features* f = hal_cpu_get_features(); + + /* Set CR0: clear EM (no emulation), set MP+NE, clear TS */ + uint32_t cr0 = read_cr0(); + cr0 &= ~(CR0_EM | CR0_TS); + cr0 |= CR0_MP | CR0_NE; + write_cr0(cr0); + + /* Initialize x87 FPU */ + __asm__ volatile("fninit"); + + /* Enable FXSAVE/FXRSTOR if supported */ + if (f->has_fxsr) { + uint32_t cr4 = read_cr4(); + cr4 |= CR4_OSFXSR | CR4_OSXMMEXCPT; + write_cr4(cr4); + g_fpu_has_fxsr = 1; + kprintf("[FPU] FXSAVE/FXRSTOR enabled (SSE context support).\n"); + } else { + kprintf("[FPU] Using legacy FSAVE/FRSTOR.\n"); + } + + /* Capture clean FPU state as template for new processes */ + memset(g_fpu_clean_state, 0, FPU_STATE_SIZE); + arch_fpu_save(g_fpu_clean_state); + + kprintf("[FPU] FPU/SSE context switching initialized.\n"); +} + +void arch_fpu_save(uint8_t* state) { + if (g_fpu_has_fxsr) { + __asm__ volatile("fxsave (%0)" :: "r"(state) : "memory"); + } else { + __asm__ volatile("fnsave (%0)" :: "r"(state) : "memory"); + /* fnsave resets the FPU — reinitialize so current process can keep using it */ + __asm__ volatile("fninit"); + } +} + +void arch_fpu_restore(const uint8_t* state) { + if (g_fpu_has_fxsr) { + __asm__ volatile("fxrstor (%0)" :: "r"(state) : "memory"); + } else { + __asm__ volatile("frstor (%0)" :: "r"(state) : "memory"); + } +} + +void arch_fpu_init_state(uint8_t* state) { + memcpy(state, g_fpu_clean_state, FPU_STATE_SIZE); +} diff --git 
a/src/kernel/fpu.c b/src/kernel/fpu.c new file mode 100644 index 0000000..0796b11 --- /dev/null +++ b/src/kernel/fpu.c @@ -0,0 +1,23 @@ +#include "arch_fpu.h" +#include "console.h" +#include <string.h> + +__attribute__((weak)) +void arch_fpu_init(void) { + kprintf("[FPU] No arch-specific FPU support.\n"); +} + +__attribute__((weak)) +void arch_fpu_save(uint8_t* state) { + (void)state; +} + +__attribute__((weak)) +void arch_fpu_restore(const uint8_t* state) { + (void)state; +} + +__attribute__((weak)) +void arch_fpu_init_state(uint8_t* state) { + memset(state, 0, FPU_STATE_SIZE); +} diff --git a/src/kernel/main.c b/src/kernel/main.c index ff80841..343a8aa 100644 --- a/src/kernel/main.c +++ b/src/kernel/main.c @@ -21,6 +21,7 @@ #include "hal/cpu.h" #include "hal/cpu_features.h" +#include "arch_fpu.h" #include "shm.h" #include "net.h" @@ -39,6 +40,7 @@ void kernel_main(const struct boot_info* bi) { hal_cpu_detect_features(); hal_cpu_print_features(); + arch_fpu_init(); kprintf("[AdrOS] Initializing PMM...\n"); diff --git a/src/kernel/scheduler.c b/src/kernel/scheduler.c index 4a4fa97..8ec5f01 100644 --- a/src/kernel/scheduler.c +++ b/src/kernel/scheduler.c @@ -10,6 +10,7 @@ #include "hal/cpu.h" #include "hal/usermode.h" #include "arch_process.h" +#include "arch_fpu.h" #include "sched_pcpu.h" #include @@ -532,6 +533,12 @@ struct process* process_fork_create(uintptr_t child_as, const void* child_regs) proc->tls_base = 0; proc->clear_child_tid = NULL; + if (current_process) { + memcpy(proc->fpu_state, current_process->fpu_state, FPU_STATE_SIZE); + } else { + arch_fpu_init_state(proc->fpu_state); + } + for (int i = 0; i < PROCESS_MAX_FILES; i++) { proc->files[i] = NULL; } @@ -665,6 +672,8 @@ struct process* process_clone_create(uint32_t clone_flags, proc->heap_start = current_process->heap_start; proc->heap_break = current_process->heap_break; + memcpy(proc->fpu_state, current_process->fpu_state, FPU_STATE_SIZE); + for (int i = 0; i < PROCESS_MAX_MMAPS; i++) { proc->mmaps[i] = 
current_process->mmaps[i]; } @@ -762,6 +771,8 @@ void process_init(void) { kernel_proc->tls_base = 0; kernel_proc->clear_child_tid = NULL; + arch_fpu_init_state(kernel_proc->fpu_state); + /* Allocate a dedicated kernel stack for PID 0 with guard page. */ void* kstack0 = kstack_alloc(); if (!kstack0) { @@ -824,6 +835,8 @@ struct process* process_create_kernel(void (*entry_point)(void)) { proc->tls_base = 0; proc->clear_child_tid = NULL; + arch_fpu_init_state(proc->fpu_state); + for (int i = 0; i < PROCESS_MAX_FILES; i++) { proc->files[i] = NULL; } @@ -950,7 +963,9 @@ void schedule(void) { * * For brand-new processes, context_switch's `ret` goes to * thread_wrapper which releases the lock explicitly. */ + arch_fpu_save(prev->fpu_state); context_switch(&prev->sp, current_process->sp); + arch_fpu_restore(current_process->fpu_state); spin_unlock_irqrestore(&sched_lock, irq_flags); } diff --git a/src/mm/heap.c b/src/mm/heap.c index 68d6864..158da46 100644 --- a/src/mm/heap.c +++ b/src/mm/heap.c @@ -32,7 +32,8 @@ typedef struct block_hdr { uint8_t order; /* 5..23 */ uint8_t is_free; /* 1 = free, 0 = allocated */ uint16_t pad; -} block_hdr_t; /* 8 bytes → keeps 8-byte alignment */ + uint32_t pad2[2]; /* Pad to 16 bytes for 16-byte aligned returns */ +} block_hdr_t; /* 16 bytes → FXSAVE-safe alignment */ /* Free-list node, embedded in the data area of a free block */ typedef struct free_node {