outb %al, $0x21
outb %al, $0xA1
- /*
- * SETUP PAGING (Manually)
- * We need to map:
- * 1. Virt 0x00000000 -> Phys 0x00000000 (Identity, so we don't crash now)
- * 2. Virt 0xC0000000 -> Phys 0x00000000 (Kernel Space)
- */
-
/*
- * Map 0-16MB using 4 page tables.
- * With Multiboot2 info copied in arch_early_setup and initrd mapped via VMM,
- * we only need a small identity window for early bring-up.
+ * SETUP PAE PAGING
+ * PAE uses 3-level page tables:
+ * PDPT (4 entries x 8 bytes = 32 bytes, 32-byte aligned)
+ * PD[0..3] (512 entries x 8 bytes = 4KB each)
+ * PT (512 entries x 8 bytes = 4KB each)
+ *
+ * Address split: bits 31:30 = PDPT, bits 29:21 = PD, bits 20:12 = PT
+ *
+ * We map 0-16MB identity + higher-half using 8 page tables (2MB each).
+ * PD[0] entries 0..7 = identity map (PT boot_pt0..boot_pt7)
+ * PD[3] entries 0..7 = higher-half map (same PTs)
+ * Recursive: PD[3][508..511] point to PD[0..3]
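+ *
+ * Worked example: VA 0xC0100000 decomposes as
+ *   PDPT = 0xC0100000 >> 30           = 3
+ *   PD   = (0xC0100000 >> 21) & 0x1FF = 0
+ *   PT   = (0xC0100000 >> 12) & 0x1FF = 256
+ * i.e. PDPT[3] -> PD[3][0] (boot_pt0) -> boot_pt0[256] -> phys 0x00100000.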
*/
- /* Fill PTs (boot_pt0..boot_pt3) */
- mov $boot_pt0, %edi
- sub $KERNEL_VIRT_BASE, %edi /* Physical address of PT0 */
- xor %ebx, %ebx /* pt_index = 0 */
+ /* --- Zero out all paging structures --- */
+ mov $boot_pdpt, %edi
+ sub $KERNEL_VIRT_BASE, %edi
+ xor %eax, %eax
+ mov $(boot_pt0 - boot_pdpt + 8*4096), %ecx /* whole region, incl. .align padding after the 32-byte PDPT */
+ shr $2, %ecx /* /4 for stosl */
+ rep stosl
+ /* --- Fill page tables (boot_pt0..boot_pt7) with 64-bit PAE entries --- */
+ /* Each PT has 512 entries x 8 bytes. 8 PTs cover 8*512*4KB = 16MB. */
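+ /* Entry layout (sketch): low dword = phys bits 31:12 | flags in bits 11:0;
+  * high dword = phys bits above 32 (up to MAXPHYADDR) and NX in bit 63.
+  * e.g. phys 0x3000 | PRESENT | RW -> low dword 0x00003003, high dword 0. */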
+ mov $boot_pt0, %edi
+ sub $KERNEL_VIRT_BASE, %edi
+ xor %esi, %esi /* physical address = 0 */
+ mov $(512*8), %ecx /* 8 PTs x 512 entries = 4096 entries */
1:
- /* Fill current PT with 0x003 | (pt_index*4MB + i*4KB) */
- mov %ebx, %eax
- shl $22, %eax /* base = pt_index * 4MB */
- mov %eax, %esi
-
- mov $1024, %ecx
-2:
- mov %esi, %edx
- or $3, %edx
- mov %edx, (%edi)
+ mov %esi, %eax
+ or $0x03, %eax /* PRESENT | RW */
+ mov %eax, (%edi) /* low 32 bits */
+ movl $0, 4(%edi) /* high 32 bits = 0 */
add $4096, %esi
- add $4, %edi
- loop 2b
-
- inc %ebx
- cmp $4, %ebx
- jne 1b
+ add $8, %edi
+ dec %ecx
+ jnz 1b
- /* 3. Get Physical Address of Page Directory */
- mov $boot_pd, %edi
+ /* --- Set up PD[0]: identity map entries 0..7 -> boot_pt0..boot_pt7 --- */
+ mov $boot_pd0, %edi
sub $KERNEL_VIRT_BASE, %edi
-
- /* Link PT0..PT3 into PD for both identity and higher-half mapping */
mov $boot_pt0, %edx
- sub $KERNEL_VIRT_BASE, %edx /* pt_phys = physical address of PT0 */
- mov $0, %ebx /* i = 0 */
+ sub $KERNEL_VIRT_BASE, %edx
+ mov $8, %ecx
+2:
+ mov %edx, %eax
+ or $0x03, %eax /* PRESENT | RW */
+ mov %eax, (%edi)
+ movl $0, 4(%edi)
+ add $4096, %edx
+ add $8, %edi
+ dec %ecx
+ jnz 2b
+ /* --- Set up PD[3]: higher-half entries 0..7 -> boot_pt0..boot_pt7 --- */
+ mov $boot_pd3, %edi
+ sub $KERNEL_VIRT_BASE, %edi
+ mov $boot_pt0, %edx
+ sub $KERNEL_VIRT_BASE, %edx
+ mov $8, %ecx
3:
mov %edx, %eax
- or $3, %eax
- mov %eax, (%edi,%ebx,4) /* PD[i] */
- mov %eax, 3072(%edi,%ebx,4) /* PD[768+i] */
+ or $0x03, %eax /* PRESENT | RW */
+ mov %eax, (%edi)
+ movl $0, 4(%edi)
add $4096, %edx
- inc %ebx
- cmp $4, %ebx
- jne 3b
+ add $8, %edi
+ dec %ecx
+ jnz 3b
+
+ /* --- Recursive mapping: PD[3][508..511] -> PD[0..3] --- */
+ mov $boot_pd3, %edi
+ sub $KERNEL_VIRT_BASE, %edi
+ /* entry 508 = offset 508*8 = 4064 */
+ add $4064, %edi
+
+ mov $boot_pd0, %edx
+ sub $KERNEL_VIRT_BASE, %edx
+ mov %edx, %eax
+ or $0x03, %eax
+ mov %eax, (%edi) /* PD[3][508] -> PD[0] */
+ movl $0, 4(%edi)
+
+ mov $boot_pd1, %edx
+ sub $KERNEL_VIRT_BASE, %edx
+ mov %edx, %eax
+ or $0x03, %eax
+ mov %eax, 8(%edi) /* PD[3][509] -> PD[1] */
+ movl $0, 12(%edi)
+
+ mov $boot_pd2, %edx
+ sub $KERNEL_VIRT_BASE, %edx
+ mov %edx, %eax
+ or $0x03, %eax
+ mov %eax, 16(%edi) /* PD[3][510] -> PD[2] */
+ movl $0, 20(%edi)
- /* 6. Recursive Mapping (Optional, good for VMM later) at index 1023 */
- mov $boot_pd, %edx
+ mov $boot_pd3, %edx
sub $KERNEL_VIRT_BASE, %edx
- or $3, %edx
- mov %edx, 4092(%edi)
+ mov %edx, %eax
+ or $0x03, %eax
+ mov %eax, 24(%edi) /* PD[3][511] -> PD[3] */
+ movl $0, 28(%edi)
+
+ /* --- Set up PDPT: 4 entries -> PD[0..3] --- */
+ mov $boot_pdpt, %edi
+ sub $KERNEL_VIRT_BASE, %edi
+
+ mov $boot_pd0, %edx
+ sub $KERNEL_VIRT_BASE, %edx
+ mov %edx, %eax
+ or $0x01, %eax /* PRESENT only: RW/US are reserved bits in PAE PDPTEs */
+ mov %eax, (%edi)
+ movl $0, 4(%edi)
+
+ mov $boot_pd1, %edx
+ sub $KERNEL_VIRT_BASE, %edx
+ mov %edx, %eax
+ or $0x01, %eax
+ mov %eax, 8(%edi)
+ movl $0, 12(%edi)
- /* 7. Load CR3 */
- mov $boot_pd, %ecx
+ mov $boot_pd2, %edx
+ sub $KERNEL_VIRT_BASE, %edx
+ mov %edx, %eax
+ or $0x01, %eax
+ mov %eax, 16(%edi)
+ movl $0, 20(%edi)
+
+ mov $boot_pd3, %edx
+ sub $KERNEL_VIRT_BASE, %edx
+ mov %edx, %eax
+ or $0x01, %eax
+ mov %eax, 24(%edi)
+ movl $0, 28(%edi)
+
+ /* --- Enable PAE in CR4 --- */
+ mov %cr4, %ecx
+ or $0x20, %ecx /* CR4.PAE = bit 5 */
+ mov %ecx, %cr4
+
+ /* --- Load CR3 with PDPT physical address --- */
+ mov $boot_pdpt, %ecx
sub $KERNEL_VIRT_BASE, %ecx
mov %ecx, %cr3
- /* 8. Enable Paging (Set PG bit in CR0) */
+ /* --- Enable Paging (Set PG bit in CR0) --- */
mov %cr0, %ecx
- or $0x80000000, %ecx /* Bit 31 (PG) e Bit 0 (PE - Protection Enable) */
+ or $0x80000000, %ecx
mov %ecx, %cr0
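+
+ /* Ordering note: CR4.PAE must be set before CR0.PG; with PAE set, the
+  * mov to CR3 above reloads the CPU's four internal PDPTE registers. */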
/*
1: hlt
jmp 1b
-/* Global Paging Structures (Pre-allocated in BSS) */
+/* Global PAE Paging Structures (Pre-allocated in BSS) */
.section .bss
+.align 32
+.global boot_pdpt
+boot_pdpt:
+ .skip 32
+
.align 4096
-.global boot_pd
-boot_pd:
+.global boot_pd0
+boot_pd0:
+ .skip 4096
+.global boot_pd1
+boot_pd1:
.skip 4096
+.global boot_pd2
+boot_pd2:
+ .skip 4096
+.global boot_pd3
+boot_pd3:
+ .skip 4096
+
.global boot_pt0
boot_pt0:
- .skip 4096*4
+ .skip 4096*8
.align 16
.global arch_boot_args
#include "hal/cpu.h"
#include <stddef.h>
-/* Constants */
-#define KERNEL_VIRT_BASE 0xC0000000
+/*
+ * PAE Paging for x86-32.
+ *
+ * 3-level page tables with 64-bit entries:
+ * PDPT: 4 entries x 8 bytes = 32 bytes (its physical address goes in CR3)
+ * PD[0..3]: 512 entries x 8 bytes = 4KB each
+ * PT: 512 entries x 8 bytes = 4KB each
+ *
+ * Virtual address decomposition:
+ * bits 31:30 -> PDPT index (0-3)
+ * bits 29:21 -> PD index (0-511)
+ * bits 20:12 -> PT index (0-511)
+ * bits 11:0 -> page offset
+ *
+ * Recursive mapping (set up in boot.S):
+ * PD[3][508] -> PD[0] PD[3][509] -> PD[1]
+ * PD[3][510] -> PD[2] PD[3][511] -> PD[3]
+ *
+ * Access page table [pdpt_i][pd_i]:
+ * VA = 0xFF800000 + pdpt_i * 0x200000 + pd_i * 0x1000
+ *
+ * Access page directory [pdpt_i]:
+ * VA = 0xFFFFC000 + pdpt_i * 0x1000
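+ *
+ * These constants follow from the recursive entries:
+ *   PT window: VA = (3 << 30) | ((508 + pdpt_i) << 21) | (pd_i << 12)
+ *                 = 0xFF800000 + pdpt_i * 0x200000 + pd_i * 0x1000
+ *   PD window: VA = (3 << 30) | (511 << 21) | ((508 + pdpt_i) << 12)
+ *                 = 0xFFFFC000 + pdpt_i * 0x1000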
+ */
+
+#define KERNEL_VIRT_BASE 0xC0000000U
#define PAGE_SIZE 4096
-/* Macros for address translation */
#define V2P(x) ((uintptr_t)(x) - KERNEL_VIRT_BASE)
#define P2V(x) ((uintptr_t)(x) + KERNEL_VIRT_BASE)
-/* x86 Paging Flags */
-#define X86_PTE_PRESENT 0x1
-#define X86_PTE_RW 0x2
-#define X86_PTE_USER 0x4
-#define X86_PTE_PWT 0x8 /* Page Write-Through */
-#define X86_PTE_PCD 0x10 /* Page Cache Disable */
-#define X86_PTE_COW 0x200 /* Bit 9: OS-available, marks Copy-on-Write */
+/* PAE PTE/PDE low-32 flags (same bit positions as legacy) */
+#define X86_PTE_PRESENT 0x1ULL
+#define X86_PTE_RW 0x2ULL
+#define X86_PTE_USER 0x4ULL
+#define X86_PTE_PWT 0x8ULL
+#define X86_PTE_PCD 0x10ULL
+#define X86_PTE_COW 0x200ULL /* Bit 9: OS-available, marks Copy-on-Write */
-/* Defined in boot.S (Physical address loaded in CR3, but accessed via virt alias) */
-/* Wait, boot_pd is in BSS. Linker put it at 0xC0xxxxxx.
- So accessing boot_pd directly works fine! */
-extern uint32_t boot_pd[1024];
+/* NX bit (bit 63, only effective if IA32_EFER.NXE = 1) */
+#define X86_PTE_NX (1ULL << 63)
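+/* IA32_EFER is MSR 0xC0000080; NXE is bit 11. */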
+
+/* Defined in boot.S */
+extern uint64_t boot_pdpt[4];
+extern uint64_t boot_pd0[512];
+extern uint64_t boot_pd1[512];
+extern uint64_t boot_pd2[512];
+extern uint64_t boot_pd3[512];
static uintptr_t g_kernel_as = 0;
__asm__ volatile("invlpg (%0)" : : "r" (vaddr) : "memory");
}
-static uint32_t vmm_flags_to_x86(uint32_t flags) {
- uint32_t x86_flags = 0;
+/* --- PAE address decomposition --- */
+
+static inline uint32_t pae_pdpt_index(uint64_t va) {
+ return (uint32_t)((va >> 30) & 0x3);
+}
+
+static inline uint32_t pae_pd_index(uint64_t va) {
+ return (uint32_t)((va >> 21) & 0x1FF);
+}
+
+static inline uint32_t pae_pt_index(uint64_t va) {
+ return (uint32_t)((va >> 12) & 0x1FF);
+}
+
+/* --- Recursive mapping accessors --- */
+
+static volatile uint64_t* pae_pd_recursive(uint32_t pdpt_i) {
+ return (volatile uint64_t*)(uintptr_t)(0xFFFFC000U + pdpt_i * 0x1000U);
+}
+
+static volatile uint64_t* pae_pt_recursive(uint32_t pdpt_i, uint32_t pd_i) {
+ return (volatile uint64_t*)(uintptr_t)(0xFF800000U + pdpt_i * 0x200000U + pd_i * 0x1000U);
+}
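+/* Sanity examples: pae_pd_recursive(3) == (uint64_t*)0xFFFFF000 (kernel PD);
+ * pae_pt_recursive(3, 0) == (uint64_t*)0xFFE00000, the PT covering
+ * VAs 0xC0000000..0xC01FFFFF. */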
+
+/* --- Flag conversion --- */
+
+static uint64_t vmm_flags_to_x86(uint32_t flags) {
+ uint64_t x86_flags = 0;
if (flags & VMM_FLAG_PRESENT) x86_flags |= X86_PTE_PRESENT;
if (flags & VMM_FLAG_RW) x86_flags |= X86_PTE_RW;
if (flags & VMM_FLAG_USER) x86_flags |= X86_PTE_USER;
if (flags & VMM_FLAG_PWT) x86_flags |= X86_PTE_PWT;
if (flags & VMM_FLAG_PCD) x86_flags |= X86_PTE_PCD;
if (flags & VMM_FLAG_COW) x86_flags |= X86_PTE_COW;
+ if (flags & VMM_FLAG_NX) x86_flags |= X86_PTE_NX;
return x86_flags;
}
-static volatile uint32_t* x86_pd_recursive(void) {
- return (volatile uint32_t*)0xFFFFF000U;
-}
-
-static volatile uint32_t* x86_pt_recursive(uint32_t pd_index) {
- return (volatile uint32_t*)0xFFC00000U + ((uintptr_t)pd_index << 10);
-}
-
-static const volatile uint32_t* vmm_active_pd_virt(void) {
- return x86_pd_recursive();
-}
+/* --- Low-memory page allocator --- */
static void* pmm_alloc_page_low(void) {
- // Bring-up safety: allocate only from identity-mapped area (0-4MB)
- // until we have a general phys->virt mapping.
+    /* Bring-up safety: allocate only from the identity-mapped 0-16MB window
+     * set up in boot.S, until a general phys->virt mapping exists. */
for (int tries = 0; tries < 1024; tries++) {
void* p = pmm_alloc_page();
if (!p) return 0;
- if ((uintptr_t)p < 0x01000000) {
- return p;
- }
+ if ((uintptr_t)p < 0x01000000) return p;
pmm_free_page(p);
}
return 0;
}
+/* User space covers PDPT indices 0-2 (0x00000000 - 0xBFFFFFFF).
+ * PDPT[3] is kernel (0xC0000000 - 0xFFFFFFFF). */
+#define PAE_USER_PDPT_MAX 3
+
+/* --- Core page operations --- */
+
void vmm_map_page(uint64_t phys, uint64_t virt, uint32_t flags) {
- uint32_t pd_index = (uint32_t)(virt >> 22);
- uint32_t pt_index = (uint32_t)((virt >> 12) & 0x03FF);
+ uint32_t pi = pae_pdpt_index(virt);
+ uint32_t di = pae_pd_index(virt);
+ uint32_t ti = pae_pt_index(virt);
- volatile uint32_t* pd = x86_pd_recursive();
- if ((pd[pd_index] & X86_PTE_PRESENT) == 0) {
- uint32_t pt_phys = (uint32_t)pmm_alloc_page_low();
+ volatile uint64_t* pd = pae_pd_recursive(pi);
+ if ((pd[di] & X86_PTE_PRESENT) == 0) {
+ uint32_t pt_phys = (uint32_t)(uintptr_t)pmm_alloc_page_low();
if (!pt_phys) {
uart_print("[VMM] OOM allocating page table.\n");
return;
}
- uint32_t pde_flags = X86_PTE_PRESENT | X86_PTE_RW;
+ uint64_t pde_flags = X86_PTE_PRESENT | X86_PTE_RW;
if (flags & VMM_FLAG_USER) pde_flags |= X86_PTE_USER;
- pd[pd_index] = pt_phys | pde_flags;
+ pd[di] = (uint64_t)pt_phys | pde_flags;
- // Make sure the page-table window reflects the new PDE before touching it.
- invlpg((uintptr_t)x86_pt_recursive(pd_index));
+ invlpg((uintptr_t)pae_pt_recursive(pi, di));
- volatile uint32_t* pt = x86_pt_recursive(pd_index);
- for (int i = 0; i < 1024; i++) pt[i] = 0;
+ volatile uint64_t* pt = pae_pt_recursive(pi, di);
+ for (int i = 0; i < 512; i++) pt[i] = 0;
}
- if ((flags & VMM_FLAG_USER) && ((pd[pd_index] & X86_PTE_USER) == 0)) {
- pd[pd_index] |= X86_PTE_USER;
+ if ((flags & VMM_FLAG_USER) && ((pd[di] & X86_PTE_USER) == 0)) {
+ pd[di] |= X86_PTE_USER;
}
- volatile uint32_t* pt = x86_pt_recursive(pd_index);
- pt[pt_index] = ((uint32_t)phys) | vmm_flags_to_x86(flags);
- invlpg((uintptr_t)virt);
+ volatile uint64_t* pt = pae_pt_recursive(pi, di);
+ pt[ti] = (phys & 0x000FFFFFFFFFF000ULL) | vmm_flags_to_x86(flags); /* keep addr bits 51:12 */
+ invlpg((uintptr_t)(uint32_t)virt);
}
-uintptr_t vmm_as_create_kernel_clone(void) {
- uint32_t pd_phys = (uint32_t)pmm_alloc_page_low();
- if (!pd_phys) return 0;
-
- // Initialize the new page directory by temporarily mapping it into the current address
- // space. We avoid assuming any global phys->virt linear mapping exists.
- const uint64_t TMP_PD_VA = 0xBFFFE000ULL;
- vmm_map_page((uint64_t)pd_phys, TMP_PD_VA, VMM_FLAG_PRESENT | VMM_FLAG_RW);
- uint32_t* pd_tmp = (uint32_t*)(uintptr_t)TMP_PD_VA;
- for (int i = 0; i < 1024; i++) pd_tmp[i] = 0;
-
- // Copy current kernel mappings (higher-half PDEs). This must include dynamic mappings
- // created after boot (e.g. initrd physical range mapping).
- const volatile uint32_t* active_pd = vmm_active_pd_virt();
- for (int i = 768; i < 1024; i++) {
- pd_tmp[i] = (uint32_t)active_pd[i];
- }
-
- // Fix recursive mapping: PDE[1023] must point to this PD.
- pd_tmp[1023] = pd_phys | X86_PTE_PRESENT | X86_PTE_RW;
-
- vmm_unmap_page(TMP_PD_VA);
- return (uintptr_t)pd_phys;
+void vmm_unmap_page(uint64_t virt) {
+ uint32_t pi = pae_pdpt_index(virt);
+ uint32_t di = pae_pd_index(virt);
+ uint32_t ti = pae_pt_index(virt);
+
+ volatile uint64_t* pd = pae_pd_recursive(pi);
+ if ((pd[di] & X86_PTE_PRESENT) == 0) return;
+ volatile uint64_t* pt = pae_pt_recursive(pi, di);
+ pt[ti] = 0;
+ invlpg((uintptr_t)(uint32_t)virt);
}
-uintptr_t vmm_as_clone_user(uintptr_t src_as) {
- if (!src_as) return 0;
-
- // Temporary kernel-only mapping in the last user PDE (pdi=767). This avoids touching
- // shared higher-half kernel page tables copied from boot_pd.
- const uintptr_t TMP_MAP_VA = 0xBFF00000U;
-
- uintptr_t new_as = vmm_as_create_kernel_clone();
- if (!new_as) return 0;
-
- uint8_t* tmp = (uint8_t*)kmalloc(4096);
- if (!tmp) {
- vmm_as_destroy(new_as);
- return 0;
- }
-
- // Best-effort clone: copy present user mappings (USER PTEs), ignore kernel half.
- uintptr_t old_as = hal_cpu_get_address_space();
- vmm_as_activate(src_as);
- const volatile uint32_t* src_pd = x86_pd_recursive();
-
- for (uint32_t pdi = 0; pdi < 768; pdi++) {
- uint32_t pde = (uint32_t)src_pd[pdi];
- if ((pde & X86_PTE_PRESENT) == 0) continue;
-
- const volatile uint32_t* src_pt = x86_pt_recursive(pdi);
-
- for (uint32_t pti = 0; pti < 1024; pti++) {
- uint32_t pte = (uint32_t)src_pt[pti];
- if (!(pte & X86_PTE_PRESENT)) continue;
- if ((pte & X86_PTE_USER) == 0) continue;
- const uint32_t x86_flags = pte & 0xFFF;
-
- // Derive VMM flags.
- uint32_t flags = VMM_FLAG_PRESENT;
- if (x86_flags & X86_PTE_RW) flags |= VMM_FLAG_RW;
- if (x86_flags & X86_PTE_USER) flags |= VMM_FLAG_USER;
-
- void* dst_frame = pmm_alloc_page_low();
- if (!dst_frame) {
- vmm_as_destroy(new_as);
- return 0;
- }
-
- uint32_t src_frame = pte & 0xFFFFF000;
-
- uintptr_t va = ((uintptr_t)pdi << 22) | ((uintptr_t)pti << 12);
- vmm_as_map_page(new_as, (uint64_t)(uintptr_t)dst_frame, (uint64_t)va, flags);
+void vmm_set_page_flags(uint64_t virt, uint32_t flags) {
+ uint32_t pi = pae_pdpt_index(virt);
+ uint32_t di = pae_pd_index(virt);
+ uint32_t ti = pae_pt_index(virt);
- // Copy contents by mapping frames into a temporary kernel VA under each address space.
- // src_as is active here
- vmm_map_page((uint64_t)src_frame, (uint64_t)TMP_MAP_VA, VMM_FLAG_PRESENT | VMM_FLAG_RW);
- memcpy(tmp, (const void*)TMP_MAP_VA, 4096);
- vmm_unmap_page((uint64_t)TMP_MAP_VA);
+ volatile uint64_t* pd = pae_pd_recursive(pi);
+ if ((pd[di] & X86_PTE_PRESENT) == 0) return;
- vmm_as_activate(new_as);
- vmm_map_page((uint64_t)(uintptr_t)dst_frame, (uint64_t)TMP_MAP_VA, VMM_FLAG_PRESENT | VMM_FLAG_RW);
- memcpy((void*)TMP_MAP_VA, tmp, 4096);
- vmm_unmap_page((uint64_t)TMP_MAP_VA);
+ volatile uint64_t* pt = pae_pt_recursive(pi, di);
+ uint64_t pte = pt[ti];
+ if (!(pte & X86_PTE_PRESENT)) return;
- vmm_as_activate(src_as);
+ uint64_t phys = pte & 0x000FFFFFFFFFF000ULL;
+ pt[ti] = phys | vmm_flags_to_x86(flags);
+ invlpg((uintptr_t)(uint32_t)virt);
+}
- }
+void vmm_protect_range(uint64_t vaddr, uint64_t len, uint32_t flags) {
+ if (len == 0) return;
+ uint64_t start = vaddr & ~0xFFFULL;
+ uint64_t end = (vaddr + len - 1) & ~0xFFFULL;
+ for (uint64_t va = start;; va += 0x1000ULL) {
+ vmm_set_page_flags(va, flags | VMM_FLAG_PRESENT);
+ if (va == end) break;
}
-
- vmm_as_activate(old_as);
-
- kfree(tmp);
- return new_as;
}
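+
+/* Example (text_va/text_len hypothetical): drop RW on a user text region;
+ * vmm_protect_range ORs in VMM_FLAG_PRESENT itself:
+ *   vmm_protect_range(text_va, text_len, VMM_FLAG_USER);
+ */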
+/* --- Address space management --- */
+
void vmm_as_activate(uintptr_t as) {
if (!as) return;
hal_cpu_set_address_space(as);
void vmm_as_map_page(uintptr_t as, uint64_t phys, uint64_t virt, uint32_t flags) {
if (!as) return;
uintptr_t old_as = hal_cpu_get_address_space();
- if ((old_as & ~(uintptr_t)0xFFFU) != (as & ~(uintptr_t)0xFFFU)) {
+ if ((old_as & ~(uintptr_t)0x1FU) != (as & ~(uintptr_t)0x1FU)) {
 vmm_as_activate(as);
 vmm_map_page(phys, virt, flags);
 vmm_as_activate(old_as);
+ } else {
+ vmm_map_page(phys, virt, flags); /* target AS is already active */
 }
}
+/*
+ * Create a new address space (PDPT + 4 PDs) that shares all kernel mappings
+ * with the current address space. User-space PDs are empty.
+ *
+ * Returns the *physical* address of the new PDPT (suitable for CR3).
+ */
+uintptr_t vmm_as_create_kernel_clone(void) {
+ /* Allocate PDPT (32 bytes, but occupies one page for simplicity) */
+ uint32_t pdpt_phys = (uint32_t)(uintptr_t)pmm_alloc_page_low();
+ if (!pdpt_phys) return 0;
+
+ /* Allocate 4 page directories */
+ uint32_t pd_phys[4];
+ for (int i = 0; i < 4; i++) {
+ pd_phys[i] = (uint32_t)(uintptr_t)pmm_alloc_page_low();
+ if (!pd_phys[i]) {
+ for (int j = 0; j < i; j++) pmm_free_page((void*)(uintptr_t)pd_phys[j]);
+ pmm_free_page((void*)(uintptr_t)pdpt_phys);
+ return 0;
+ }
+ }
+
+ const uint64_t TMP_VA = 0xBFFFE000ULL;
+
+ /* --- Initialize PDPT --- */
+ vmm_map_page((uint64_t)pdpt_phys, TMP_VA, VMM_FLAG_PRESENT | VMM_FLAG_RW);
+ uint64_t* pdpt_tmp = (uint64_t*)(uintptr_t)TMP_VA;
+ memset(pdpt_tmp, 0, PAGE_SIZE);
+ for (int i = 0; i < 4; i++) {
+ pdpt_tmp[i] = (uint64_t)pd_phys[i] | 0x1ULL; /* PRESENT */
+ }
+ vmm_unmap_page(TMP_VA);
+
+ /* --- Initialize each PD --- */
+ for (int i = 0; i < 4; i++) {
+ vmm_map_page((uint64_t)pd_phys[i], TMP_VA, VMM_FLAG_PRESENT | VMM_FLAG_RW);
+ uint64_t* pd_tmp = (uint64_t*)(uintptr_t)TMP_VA;
+ memset(pd_tmp, 0, PAGE_SIZE);
+
+ if (i == 3) {
+ /* Copy kernel PDEs from current PD[3] */
+ volatile uint64_t* active_pd3 = pae_pd_recursive(3);
+ for (int j = 0; j < 512; j++) {
+ pd_tmp[j] = (uint64_t)active_pd3[j];
+ }
+ /* Fix recursive mapping: PD[3][508..511] -> new PD[0..3] */
+ pd_tmp[508] = (uint64_t)pd_phys[0] | X86_PTE_PRESENT | X86_PTE_RW;
+ pd_tmp[509] = (uint64_t)pd_phys[1] | X86_PTE_PRESENT | X86_PTE_RW;
+ pd_tmp[510] = (uint64_t)pd_phys[2] | X86_PTE_PRESENT | X86_PTE_RW;
+ pd_tmp[511] = (uint64_t)pd_phys[3] | X86_PTE_PRESENT | X86_PTE_RW;
+ }
+
+ vmm_unmap_page(TMP_VA);
+ }
+
+ return (uintptr_t)pdpt_phys;
+}
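+/* Typical use (illustrative; error handling and frame_phys are assumed):
+ *   uintptr_t as = vmm_as_create_kernel_clone();
+ *   vmm_as_map_page(as, frame_phys, 0x08048000ULL,
+ *                   VMM_FLAG_PRESENT | VMM_FLAG_RW | VMM_FLAG_USER);
+ *   vmm_as_activate(as);  // loads CR3 with the new PDPT
+ */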
+
void vmm_as_destroy(uintptr_t as) {
if (!as) return;
if (as == g_kernel_as) return;
uintptr_t old_as = hal_cpu_get_address_space();
vmm_as_activate(as);
- volatile uint32_t* pd = x86_pd_recursive();
-
- // Free user page tables + frames for user space.
- for (int pdi = 0; pdi < 768; pdi++) {
- uint32_t pde = (uint32_t)pd[pdi];
- if ((pde & X86_PTE_PRESENT) == 0) continue;
-
- uint32_t pt_phys = pde & 0xFFFFF000;
- volatile uint32_t* pt = x86_pt_recursive((uint32_t)pdi);
-
- for (int pti = 0; pti < 1024; pti++) {
- uint32_t pte = (uint32_t)pt[pti];
- if ((pte & X86_PTE_PRESENT) == 0) continue;
- uint32_t frame = pte & 0xFFFFF000;
- pmm_free_page((void*)(uintptr_t)frame);
- pt[pti] = 0;
- }
- pmm_free_page((void*)(uintptr_t)pt_phys);
- pd[pdi] = 0;
+ /* Free user page tables + frames (PDPT[0..2]) */
+ for (uint32_t pi = 0; pi < PAE_USER_PDPT_MAX; pi++) {
+ volatile uint64_t* pd = pae_pd_recursive(pi);
+ for (uint32_t di = 0; di < 512; di++) {
+ uint64_t pde = pd[di];
+ if ((pde & X86_PTE_PRESENT) == 0) continue;
+
+ uint32_t pt_phys = (uint32_t)(pde & 0xFFFFF000ULL);
+ volatile uint64_t* pt = pae_pt_recursive(pi, di);
+
+ for (int ti = 0; ti < 512; ti++) {
+ uint64_t pte = pt[ti];
+ if ((pte & X86_PTE_PRESENT) == 0) continue;
+ uint32_t frame = (uint32_t)(pte & 0xFFFFF000ULL);
+ pmm_free_page((void*)(uintptr_t)frame);
+ pt[ti] = 0;
+ }
+
+ pmm_free_page((void*)(uintptr_t)pt_phys);
+ pd[di] = 0;
+ }
}
+ /* Read PD physical addresses from PD[3] recursive entries before switching away */
+ volatile uint64_t* pd3 = pae_pd_recursive(3);
+ uint32_t pd_phys[4];
+ pd_phys[0] = (uint32_t)(pd3[508] & 0xFFFFF000ULL);
+ pd_phys[1] = (uint32_t)(pd3[509] & 0xFFFFF000ULL);
+ pd_phys[2] = (uint32_t)(pd3[510] & 0xFFFFF000ULL);
+ pd_phys[3] = (uint32_t)(pd3[511] & 0xFFFFF000ULL);
+
vmm_as_activate(old_as);
+
+ /* Free PDs and PDPT */
+ for (int i = 0; i < 4; i++) {
+ if (pd_phys[i]) pmm_free_page((void*)(uintptr_t)pd_phys[i]);
+ }
pmm_free_page((void*)(uintptr_t)as);
}
-void vmm_set_page_flags(uint64_t virt, uint32_t flags) {
- uint32_t pd_index = virt >> 22;
- uint32_t pt_index = (virt >> 12) & 0x03FF;
+uintptr_t vmm_as_clone_user(uintptr_t src_as) {
+ if (!src_as) return 0;
+
+ /* Scratch VA at the top of user space (pdpt 2, pd 511): keeps the temporary
+  * mapping out of the kernel page tables shared between address spaces. */
+ const uintptr_t TMP_MAP_VA = 0xBFF00000U;
- volatile uint32_t* pd = x86_pd_recursive();
- if ((pd[pd_index] & X86_PTE_PRESENT) == 0) return;
+ uintptr_t new_as = vmm_as_create_kernel_clone();
+ if (!new_as) return 0;
- volatile uint32_t* pt = x86_pt_recursive(pd_index);
- uint32_t pte = pt[pt_index];
- if (!(pte & X86_PTE_PRESENT)) {
- return;
+ uint8_t* tmp = (uint8_t*)kmalloc(4096);
+ if (!tmp) {
+ vmm_as_destroy(new_as);
+ return 0;
}
- uint32_t phys = pte & 0xFFFFF000;
- pt[pt_index] = phys | vmm_flags_to_x86(flags);
- invlpg((uintptr_t)virt);
-}
+ uintptr_t old_as = hal_cpu_get_address_space();
+ vmm_as_activate(src_as);
-void vmm_protect_range(uint64_t vaddr, uint64_t len, uint32_t flags) {
- if (len == 0) return;
+ for (uint32_t pi = 0; pi < PAE_USER_PDPT_MAX; pi++) {
+ volatile uint64_t* src_pd = pae_pd_recursive(pi);
+ for (uint32_t di = 0; di < 512; di++) {
+ uint64_t pde = src_pd[di];
+ if ((pde & X86_PTE_PRESENT) == 0) continue;
- uint64_t start = vaddr & ~0xFFFULL;
- uint64_t end = (vaddr + len - 1) & ~0xFFFULL;
- for (uint64_t va = start;; va += 0x1000ULL) {
- vmm_set_page_flags(va, flags | VMM_FLAG_PRESENT);
- if (va == end) break;
+ volatile uint64_t* src_pt = pae_pt_recursive(pi, di);
+ for (uint32_t ti = 0; ti < 512; ti++) {
+ uint64_t pte = src_pt[ti];
+ if (!(pte & X86_PTE_PRESENT)) continue;
+ if (!(pte & X86_PTE_USER)) continue;
+
+ uint32_t flags = VMM_FLAG_PRESENT;
+ if (pte & X86_PTE_RW) flags |= VMM_FLAG_RW;
+ if (pte & X86_PTE_USER) flags |= VMM_FLAG_USER;
+
+ void* dst_frame = pmm_alloc_page_low();
+ if (!dst_frame) { kfree(tmp); vmm_as_destroy(new_as); vmm_as_activate(old_as); return 0; }
+
+ uint32_t src_frame = (uint32_t)(pte & 0xFFFFF000ULL);
+ uintptr_t va = ((uintptr_t)pi << 30) | ((uintptr_t)di << 21) | ((uintptr_t)ti << 12);
+
+ vmm_as_map_page(new_as, (uint64_t)(uintptr_t)dst_frame, (uint64_t)va, flags);
+
+ vmm_map_page((uint64_t)src_frame, (uint64_t)TMP_MAP_VA, VMM_FLAG_PRESENT | VMM_FLAG_RW);
+ memcpy(tmp, (const void*)TMP_MAP_VA, 4096);
+ vmm_unmap_page((uint64_t)TMP_MAP_VA);
+
+ vmm_as_activate(new_as);
+ vmm_map_page((uint64_t)(uintptr_t)dst_frame, (uint64_t)TMP_MAP_VA, VMM_FLAG_PRESENT | VMM_FLAG_RW);
+ memcpy((void*)TMP_MAP_VA, tmp, 4096);
+ vmm_unmap_page((uint64_t)TMP_MAP_VA);
+
+ vmm_as_activate(src_as);
+ }
+ }
}
-}
-void vmm_unmap_page(uint64_t virt) {
- uint32_t pd_index = virt >> 22;
- uint32_t pt_index = (virt >> 12) & 0x03FF;
-
- volatile uint32_t* pd = x86_pd_recursive();
- if ((pd[pd_index] & X86_PTE_PRESENT) == 0) return;
- volatile uint32_t* pt = x86_pt_recursive(pd_index);
- pt[pt_index] = 0;
- invlpg((uintptr_t)virt);
+ vmm_as_activate(old_as);
+ kfree(tmp);
+ return new_as;
}
uintptr_t vmm_as_clone_user_cow(uintptr_t src_as) {
uintptr_t old_as = hal_cpu_get_address_space();
vmm_as_activate(src_as);
- volatile uint32_t* src_pd = x86_pd_recursive();
-
- for (uint32_t pdi = 0; pdi < 768; pdi++) {
- uint32_t pde = (uint32_t)src_pd[pdi];
- if ((pde & X86_PTE_PRESENT) == 0) continue;
-
- volatile uint32_t* src_pt = x86_pt_recursive(pdi);
-
- for (uint32_t pti = 0; pti < 1024; pti++) {
- uint32_t pte = (uint32_t)src_pt[pti];
- if (!(pte & X86_PTE_PRESENT)) continue;
- if ((pte & X86_PTE_USER) == 0) continue;
-
- uint32_t frame_phys = pte & 0xFFFFF000;
- uintptr_t va = ((uintptr_t)pdi << 22) | ((uintptr_t)pti << 12);
-
- // Mark source page as read-only + CoW if it was writable.
- uint32_t new_pte = frame_phys | X86_PTE_PRESENT | X86_PTE_USER;
- if (pte & X86_PTE_RW) {
- new_pte |= X86_PTE_COW; // Was writable -> CoW
- // Remove RW from source
- src_pt[pti] = new_pte;
- invlpg(va);
- } else {
- new_pte = pte; // Keep as-is (read-only text, etc.)
- }
- // Increment physical frame refcount
- pmm_incref((uintptr_t)frame_phys);
-
- // Map same frame into child with same flags
- vmm_as_map_page(new_as, (uint64_t)frame_phys, (uint64_t)va,
- VMM_FLAG_PRESENT | VMM_FLAG_USER |
- ((new_pte & X86_PTE_COW) ? VMM_FLAG_COW : 0));
+ for (uint32_t pi = 0; pi < PAE_USER_PDPT_MAX; pi++) {
+ volatile uint64_t* src_pd = pae_pd_recursive(pi);
+ for (uint32_t di = 0; di < 512; di++) {
+ uint64_t pde = src_pd[di];
+ if ((pde & X86_PTE_PRESENT) == 0) continue;
+
+ volatile uint64_t* src_pt = pae_pt_recursive(pi, di);
+ for (uint32_t ti = 0; ti < 512; ti++) {
+ uint64_t pte = src_pt[ti];
+ if (!(pte & X86_PTE_PRESENT)) continue;
+ if (!(pte & X86_PTE_USER)) continue;
+
+ uint32_t frame_phys = (uint32_t)(pte & 0xFFFFF000ULL);
+ uintptr_t va = ((uintptr_t)pi << 30) | ((uintptr_t)di << 21) | ((uintptr_t)ti << 12);
+
+ uint64_t new_pte = (uint64_t)frame_phys | X86_PTE_PRESENT | X86_PTE_USER;
+ if (pte & X86_PTE_RW) {
+ new_pte |= X86_PTE_COW;
+ src_pt[ti] = new_pte;
+ invlpg(va);
+ } else {
+ new_pte = pte;
+ }
+
+ pmm_incref((uintptr_t)frame_phys);
+
+ vmm_as_map_page(new_as, (uint64_t)frame_phys, (uint64_t)va,
+ VMM_FLAG_PRESENT | VMM_FLAG_USER |
+ ((new_pte & X86_PTE_COW) ? VMM_FLAG_COW : 0));
+ }
}
}
int vmm_handle_cow_fault(uintptr_t fault_addr) {
uintptr_t va = fault_addr & ~(uintptr_t)0xFFF;
- uint32_t pdi = va >> 22;
- uint32_t pti = (va >> 12) & 0x3FF;
+ uint32_t pi = pae_pdpt_index((uint64_t)va);
+ uint32_t di = pae_pd_index((uint64_t)va);
+ uint32_t ti = pae_pt_index((uint64_t)va);
- if (pdi >= 768) return 0; // Kernel space, not CoW
+ if (pi >= PAE_USER_PDPT_MAX) return 0; /* Kernel space, not CoW */
- volatile uint32_t* pd = x86_pd_recursive();
- if ((pd[pdi] & X86_PTE_PRESENT) == 0) return 0;
+ volatile uint64_t* pd = pae_pd_recursive(pi);
+ if ((pd[di] & X86_PTE_PRESENT) == 0) return 0;
- volatile uint32_t* pt = x86_pt_recursive(pdi);
- uint32_t pte = pt[pti];
+ volatile uint64_t* pt = pae_pt_recursive(pi, di);
+ uint64_t pte = pt[ti];
if (!(pte & X86_PTE_PRESENT)) return 0;
if (!(pte & X86_PTE_COW)) return 0;
- uint32_t old_frame = pte & 0xFFFFF000;
+ uint32_t old_frame = (uint32_t)(pte & 0xFFFFF000ULL);
uint16_t rc = pmm_get_refcount((uintptr_t)old_frame);
if (rc <= 1) {
- // We're the sole owner — just make it writable and clear CoW.
- pt[pti] = old_frame | X86_PTE_PRESENT | X86_PTE_RW | X86_PTE_USER;
+ pt[ti] = (uint64_t)old_frame | X86_PTE_PRESENT | X86_PTE_RW | X86_PTE_USER;
invlpg(va);
return 1;
}
- // Allocate a new frame and copy the page contents.
void* new_frame = pmm_alloc_page();
- if (!new_frame) return 0; // OOM — caller will SIGSEGV
+ if (!new_frame) return 0;
- // Use a temporary kernel VA to copy data.
const uintptr_t TMP_COW_VA = 0xBFFFD000U;
vmm_map_page((uint64_t)(uintptr_t)new_frame, (uint64_t)TMP_COW_VA,
VMM_FLAG_PRESENT | VMM_FLAG_RW);
memcpy((void*)TMP_COW_VA, (const void*)va, 4096);
vmm_unmap_page((uint64_t)TMP_COW_VA);
- // Decrement old frame refcount.
pmm_decref((uintptr_t)old_frame);
- // Map new frame as writable (no CoW).
- pt[pti] = (uint32_t)(uintptr_t)new_frame | X86_PTE_PRESENT | X86_PTE_RW | X86_PTE_USER;
+ pt[ti] = (uint64_t)(uintptr_t)new_frame | X86_PTE_PRESENT | X86_PTE_RW | X86_PTE_USER;
invlpg(va);
return 1;
}
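+
+/* Intended caller (sketch): the page-fault handler, on a write fault
+ * (error code P=1, W=1) at a user address; returns 1 if the fault was a
+ * resolvable CoW fault, 0 otherwise (caller then delivers SIGSEGV). */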
void vmm_init(void) {
- uart_print("[VMM] Higher Half Kernel Active.\n");
+ uart_print("[VMM] PAE paging active.\n");
g_kernel_as = hal_cpu_get_address_space();
-
- // Test mapping
+
+ /* Test mapping */
vmm_map_page(0xB8000, 0xC00B8000, VMM_FLAG_PRESENT | VMM_FLAG_RW);
uart_print("[VMM] Mapped VGA to 0xC00B8000.\n");
}