// INF6B/niacin/cpp/vm/vm.cpp

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <deque>
#include <fstream>
#include <functional>
#include <iomanip>
#include <iostream>
#include <memory>
#include <optional>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <variant>
#include <vector>

#ifdef _WIN32
#include <intrin.h>
#else
#include <x86intrin.h>
#endif

// ============================================================================
// TYPE DEFINITIONS
// ============================================================================

// Tag stored alongside a value to say which runtime type it carries.
// Every enumerator has an explicit one-byte code.
enum class TypeCode : uint8_t {
    I8   = 0x01,
    U8   = 0x02,
    I16  = 0x03,
    U16  = 0x04,
    I32  = 0x05,
    U32  = 0x06,
    F32  = 0x07,
    BOOL = 0x08,
    CHAR = 0x09,
    STR  = 0x0A
};

// One-byte instruction codes. Enumerators are listed one per line and
// grouped by the high nibble of their numeric value.
enum class Opcode : uint8_t {
    // 0x0x: push family
    PUSH_CONST  = 0x01,
    PUSH_INT    = 0x02,
    PUSH_FLOAT  = 0x03,
    PUSH_STR    = 0x04,

    // 0x1x: local slots
    LOAD_LOCAL  = 0x10,
    STORE_LOCAL = 0x11,

    // 0x2x: arithmetic, bitwise and shift
    ADD         = 0x20,
    SUB         = 0x21,
    MUL         = 0x22,
    DIV         = 0x23,
    MOD         = 0x24,
    NEG         = 0x25,
    BIT_AND     = 0x26,
    BIT_OR      = 0x27,
    BIT_XOR     = 0x28,
    SHL         = 0x29,
    SHR         = 0x2A,

    // 0x3x: F-prefixed arithmetic
    FADD        = 0x30,
    FSUB        = 0x31,
    FMUL        = 0x32,
    FDIV        = 0x33,
    FNEG        = 0x34,

    // 0x4x: comparisons
    CMP_EQ      = 0x40,
    CMP_NEQ     = 0x41,
    CMP_LT      = 0x42,
    CMP_GT      = 0x43,
    CMP_LE      = 0x44,
    CMP_GE      = 0x45,

    // 0x5x: jumps
    JMP         = 0x50,
    JMP_IF      = 0x51,
    JMP_IF_NOT  = 0x52,

    // 0x6x: call / return
    CALL        = 0x60,
    RET         = 0x61,

    // 0x7x: conversions
    CONST_CAST  = 0x70,
    TRUNC       = 0x71,
    TO_FLOAT    = 0x72,
    TO_INT      = 0x73,

    // 0x8x: stack shuffling
    DUP         = 0x80,
    POP         = 0x81,

    // 0x9x / 0xAx: output and termination
    PRINT       = 0x90,
    HALT        = 0xA0
};

// ============================================================================
// VALUE REPRESENTATION WITH TYPE PUNNING FOR PERFORMANCE
// ============================================================================

// Scalar payload shared by all non-string values. Each constructor writes
// exactly one member; the default constructor zeroes the i32 view.
union ValueData {
    int32_t i32;
    uint32_t u32;
    float f32;
    bool b;
    char c;

    ValueData() : i32{0} {}

    explicit ValueData(int32_t signed_bits) : i32{signed_bits} {}

    explicit ValueData(uint32_t unsigned_bits) : u32{unsigned_bits} {}

    explicit ValueData(float real) : f32{real} {}

    explicit ValueData(bool flag) : b{flag} {}

    explicit ValueData(char ch) : c{ch} {}
};

// A runtime value: a TypeCode tag, a scalar payload, and a separate string
// buffer that only the string constructor fills. The as_*() accessors
// convert from whatever the tag says is stored; tags without a rule for a
// given accessor yield that accessor's zero/empty fallback.
class Value {
private:
    TypeCode type_;
    ValueData data_;
    std::string str_data_; // Filled only by the std::string constructor

    // True for the tags whose payload is read through data_.i32.
    bool tagged_signed() const {
        return type_ == TypeCode::I8 || type_ == TypeCode::I16 || type_ == TypeCode::I32;
    }

    // True for the tags whose payload is read through data_.u32.
    bool tagged_unsigned() const {
        return type_ == TypeCode::U8 || type_ == TypeCode::U16 || type_ == TypeCode::U32;
    }

public:
    // Default value is an I32 with a zeroed payload.
    Value() : type_(TypeCode::I32), data_() {}

    // Scalar constructors: the caller supplies the tag; the payload member
    // that gets written is selected by the C++ type of `value`.
    explicit Value(TypeCode type, int32_t value) : type_(type), data_(value) {}
    explicit Value(TypeCode type, uint32_t value) : type_(type), data_(value) {}
    explicit Value(TypeCode type, float value) : type_(type), data_(value) {}
    explicit Value(TypeCode type, bool value) : type_(type), data_(value) {}
    explicit Value(TypeCode type, char value) : type_(type), data_(value) {}

    // String constructor: tag is always STR, payload is zeroed.
    explicit Value(const std::string& value) : type_(TypeCode::STR), data_(), str_data_(value) {}

    TypeCode type() const { return type_; }

    // Signed 32-bit view; 0 for STR and unrecognised tags.
    int32_t as_i32() const {
        if (tagged_signed()) {
            return data_.i32;
        }
        if (tagged_unsigned()) {
            return static_cast<int32_t>(data_.u32);
        }
        if (type_ == TypeCode::F32) {
            return static_cast<int32_t>(data_.f32);
        }
        if (type_ == TypeCode::BOOL) {
            return data_.b ? 1 : 0;
        }
        if (type_ == TypeCode::CHAR) {
            return static_cast<int32_t>(data_.c);
        }
        return 0;
    }

    // Unsigned 32-bit view; 0 for STR and unrecognised tags.
    uint32_t as_u32() const {
        if (tagged_signed()) {
            return static_cast<uint32_t>(data_.i32);
        }
        if (tagged_unsigned()) {
            return data_.u32;
        }
        if (type_ == TypeCode::F32) {
            return static_cast<uint32_t>(data_.f32);
        }
        if (type_ == TypeCode::BOOL) {
            return data_.b ? 1 : 0;
        }
        if (type_ == TypeCode::CHAR) {
            return static_cast<uint32_t>(data_.c);
        }
        return 0;
    }

    // Single-precision view; 0.0f for STR and unrecognised tags.
    float as_f32() const {
        if (tagged_signed()) {
            return static_cast<float>(data_.i32);
        }
        if (tagged_unsigned()) {
            return static_cast<float>(data_.u32);
        }
        if (type_ == TypeCode::F32) {
            return data_.f32;
        }
        if (type_ == TypeCode::BOOL) {
            return data_.b ? 1.0f : 0.0f;
        }
        if (type_ == TypeCode::CHAR) {
            return static_cast<float>(data_.c);
        }
        return 0.0f;
    }

    // Truthiness: non-zero scalars and non-empty strings are true;
    // unrecognised tags are false.
    bool as_bool() const {
        if (type_ == TypeCode::BOOL) {
            return data_.b;
        }
        if (tagged_signed()) {
            return data_.i32 != 0;
        }
        if (tagged_unsigned()) {
            return data_.u32 != 0;
        }
        if (type_ == TypeCode::F32) {
            return data_.f32 != 0.0f;
        }
        if (type_ == TypeCode::CHAR) {
            return data_.c != '\0';
        }
        if (type_ == TypeCode::STR) {
            return !str_data_.empty();
        }
        return false;
    }

    // Direct access to the string buffer, regardless of the tag.
    const std::string& as_string() const { return str_data_; }

    // Text rendering used for printing; "unknown" for unrecognised tags.
    std::string to_string() const {
        if (tagged_signed()) {
            return std::to_string(data_.i32);
        }
        if (tagged_unsigned()) {
            return std::to_string(data_.u32);
        }
        if (type_ == TypeCode::F32) {
            return std::to_string(data_.f32);
        }
        if (type_ == TypeCode::BOOL) {
            return data_.b ? "true" : "false";
        }
        if (type_ == TypeCode::CHAR) {
            return std::string(1, data_.c);
        }
        if (type_ == TypeCode::STR) {
            return str_data_;
        }
        return "unknown";
    }
};

// ============================================================================
// JIT COMPILATION AND OPTIMIZATION
// ============================================================================

// Accumulates x86-64 machine-code bytes in a buffer. Execution is only
// simulated: execute() logs the buffer size and returns a fixed value.
class JITCompiler {
private:
    std::vector<uint8_t> native_code_;
    size_t code_offset_; // Number of bytes emitted since the last reset()

    // Emits `REX.W <opcode> imm64`, i.e. a 64-bit register load.
    // The 32-bit immediate is sign-extended to 64 bits so that negative
    // values keep their value in the full-width register (zero-padding the
    // upper half would turn -1 into 0x00000000FFFFFFFF). Bytes are written
    // least-significant first, as the x86 encoding requires, independent of
    // the host's byte order.
    void emit_mov_r64_imm(uint8_t opcode, int32_t value) {
        emit_byte(0x48); // REX.W
        emit_byte(opcode);
        const uint64_t bits = static_cast<uint64_t>(static_cast<int64_t>(value));
        for (unsigned shift = 0; shift < 64; shift += 8) {
            emit_byte(static_cast<uint8_t>((bits >> shift) & 0xFFu));
        }
    }

public:
    JITCompiler() : code_offset_(0) {}

    // Discards all emitted bytes.
    void reset() {
        native_code_.clear();
        code_offset_ = 0;
    }

    // Appends `size` raw bytes starting at `data` (object representation,
    // host byte order).
    template<typename T>
    void emit_bytes(const T* data, size_t size) {
        const uint8_t* bytes = reinterpret_cast<const uint8_t*>(data);
        native_code_.insert(native_code_.end(), bytes, bytes + size);
        code_offset_ += size;
    }

    // Appends a single byte.
    void emit_byte(uint8_t b) {
        native_code_.push_back(b);
        code_offset_++;
    }

    // MOV RAX, imm64 (value sign-extended from 32 bits).
    void emit_mov_rax_imm(int32_t value) {
        emit_mov_r64_imm(0xB8, value);
    }

    // MOV RBX, imm64 (value sign-extended from 32 bits).
    void emit_mov_rbx_imm(int32_t value) {
        emit_mov_r64_imm(0xBB, value);
    }

    // ADD RAX, RBX
    void emit_add_rax_rbx() {
        emit_byte(0x48); // REX.W
        emit_byte(0x01); // ADD r/m64, r64
        emit_byte(0xD8); // ModRM: RAX, RBX
    }

    // RET
    void emit_ret() {
        emit_byte(0xC3);
    }

    const std::vector<uint8_t>& get_code() const { return native_code_; }

    // Simulated execution of the emitted code.
    // Returns 0 when nothing has been emitted; otherwise logs the buffer
    // size and returns the placeholder result 42. The bytes are never
    // actually run (that would need executable memory, e.g. mmap with
    // PROT_EXEC).
    int32_t execute() {
        if (native_code_.empty()) return 0;

        std::cout << "[JIT] Executing optimized native code ("
            << native_code_.size() << " bytes)" << std::endl;
        return 42; // Simulated result
    }
};

// ============================================================================
// HOT CODE DETECTOR AND OPTIMIZER
// ============================================================================

class HotCodeDetector {
private:
    struct BlockInfo {
        size_t execution_count;
        size_t start_ip;
        size_t end_ip;
        std::vector<uint8_t> bytecode;
    };

    std::unordered_map<size_t, BlockInfo> hot_blocks_;
    size_t threshold_;

public:
    HotCodeDetector(size_t threshold = 1000) : threshold_(threshold) {}

    void record_execution(size_t ip, const std::vector<uint8_t>& bytecode, size_t block_size) {
        auto it = hot_blocks_.find(ip);
        if (it == hot_blocks_.end()) {
            BlockInfo info;
            info.execution_count = 1;
            info.start_ip = ip;
            info.end_ip = ip + block_size;
            info.bytecode.assign(bytecode.begin() + ip, bytecode.begin() + ip + block_size);
            hot_blocks_[ip] = info;
        }
        else {
            it->second.execution_count++;
        }
    }

    bool is_hot(size_t ip) const {
        auto it = hot_blocks_.find(ip);
        return it != hot_blocks_.end() && it->second.execution_count >= threshold_;
    }

    const BlockInfo* get_hot_block(size_t ip) const {
        auto it = hot_blocks_.find(ip);
        return (it != hot_blocks_.end() && it->second.execution_count >= threshold_) ? &it->second : nullptr;
    }

    void optimize_block(const BlockInfo& block, JITCompiler& jit) {
        // Simple JIT: convert basic arithmetic operations to native code
        size_t ip = 0;
        const auto& code = block.bytecode;

        while (ip < code.size()) {
            Opcode op = static_cast<Opcode>(code[ip++]);

            switch (op) {
            case Opcode::PUSH_INT: {
                // Skip width and value
                ip += 5;
                break;
            }
            case Opcode::ADD: {
                jit.emit_add_rax_rbx();
                break;
            }
            case Opcode::PUSH_CONST: {
                // Would need constant processing
                ip += 4;
                break;
            }
            default:
                // Can't optimize this opcode in simple JIT
                return;
            }
        }

        jit.emit_ret();
    }
};

// ============================================================================
// HIGH-PERFORMANCE VM WITH JIT
// ============================================================================

class OptimizedVM {
private:
    std::vector<uint8_t> bytecode_;
    std::vector<Value> constants_;
    std::vector<std::vector<Value>> functions_;

    // Execution state
    size_t ip_;
    std::vector<Value> stack_;
    std::vector<std::vector<Value>> call_stack_;
    std::vector<Value> locals_;
    bool halted_;

    // Optimization components
    HotCodeDetector hot_detector_;
    JITCompiler jit_compiler_;
    std::unordered_map<size_t, std::function<Value()>> jit_cache_;

    // Inline cache for method calls
    struct InlineCacheEntry {
        size_t target_func;
        size_t call_count;
    };
    std::unordered_map<size_t, InlineCacheEntry> inline_cache_;

public:
    OptimizedVM() : ip_(0), halted_(false), hot_detector_(100) {}

    bool load_bytecode(const std::vector<uint8_t>& bytecode) {
        bytecode_ = bytecode;
        return parse_bytecode();
    }

    bool load_bytecode_from_file(const std::string& filename) {
        std::ifstream file(filename, std::ios::binary);
        if (!file) return false;

        file.seekg(0, std::ios::end);
        size_t size = file.tellg();
        file.seekg(0, std::ios::beg);

        bytecode_.resize(size);
        file.read(reinterpret_cast<char*>(bytecode_.data()), size);

        return parse_bytecode();
    }

private:
    bool parse_bytecode() {
        // Simplified parser - real implementation would parse .popclass format
        if (bytecode_.size() < 4 || std::string(bytecode_.begin(), bytecode_.begin() + 4) != "POPC") {
            return false;
        }
        ip_ = 8; // Skip header
        return true;
    }

    uint8_t fetch_byte() {
        return bytecode_[ip_++];
    }

    uint16_t fetch_u16() {
        uint16_t value;
        std::memcpy(&value, &bytecode_[ip_], sizeof(value));
        ip_ += sizeof(value);
        return value;
    }

    uint32_t fetch_u32() {
        uint32_t value;
        std::memcpy(&value, &bytecode_[ip_], sizeof(value));
        ip_ += sizeof(value);
        return value;
    }

    int32_t fetch_i32() {
        int32_t value;
        std::memcpy(&value, &bytecode_[ip_], sizeof(value));
        ip_ += sizeof(value);
        return value;
    }

    float fetch_f32() {
        float value;
        std::memcpy(&value, &bytecode_[ip_], sizeof(value));
        ip_ += sizeof(value);
        return value;
    }

    void push(const Value& value) {
        stack_.push_back(value);
    }

    Value pop() {
        if (stack_.empty()) throw std::runtime_error("Stack underflow");
        Value value = stack_.back();
        stack_.pop_back();
        return value;
    }

    Value& peek(size_t offset = 0) {
        if (offset >= stack_.size()) throw std::runtime_error("Stack peek out of bounds");
        return stack_[stack_.size() - 1 - offset];
    }

public:
    void run(bool enable_jit = true) {
        ip_ = 0;
        halted_ = false;
        stack_.clear();
        call_stack_.clear();
        locals_.clear();

        // Main execution loop with performance counters
        size_t instructions_executed = 0;
        auto start_time = std::chrono::high_resolution_clock::now();

        while (!halted_ && ip_ < bytecode_.size()) {
            // Check for JIT-optimized block
            if (enable_jit) {
                auto jit_it = jit_cache_.find(ip_);
                if (jit_it != jit_cache_.end()) {
                    Value result = jit_it->second();
                    push(result);
                    continue;
                }

                // Check for hot code blocks
                if (hot_detector_.is_hot(ip_)) {
                    const auto* hot_block = hot_detector_.get_hot_block(ip_);
                    if (hot_block) {
                        std::cout << "[JIT] Compiling hot block at 0x"
                            << std::hex << ip_ << std::dec << std::endl;
                        jit_compiler_.reset();
                        hot_detector_.optimize_block(*hot_block, jit_compiler_);

                        // Cache the JIT function
                        auto jit_func = [this]() -> Value {
                            return Value(TypeCode::I32, jit_compiler_.execute());
                            };
                        jit_cache_[ip_] = jit_func;

                        // Execute JIT version
                        Value result = jit_func();
                        push(result);
                        continue;
                    }
                }
            }

            // Record execution for hot code detection
            hot_detector_.record_execution(ip_, bytecode_, 16); // Monitor 16-byte blocks

            execute_instruction();
            instructions_executed++;

            // Basic bounds checking
            if (stack_.size() > 1000000) {
                throw std::runtime_error("Stack overflow protection");
            }
        }

        auto end_time = std::chrono::high_resolution_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);

        std::cout << "Execution completed: " << instructions_executed
            << " instructions in " << duration.count() << " μs ("
            << (instructions_executed * 1000000.0 / duration.count())
            << " instructions/sec)" << std::endl;
    }

private:
    void execute_instruction() {
        Opcode opcode = static_cast<Opcode>(fetch_byte());

        switch (opcode) {
        case Opcode::PUSH_INT: {
            uint8_t width = fetch_byte();
            int32_t value = fetch_i32();
            push(Value(TypeCode::I32, value));
            break;
        }

        case Opcode::PUSH_FLOAT: {
            float value = fetch_f32();
            push(Value(TypeCode::F32, value));
            break;
        }

        case Opcode::PUSH_CONST: {
            uint32_t const_idx = fetch_u32();
            if (const_idx < constants_.size()) {
                push(constants_[const_idx]);
            }
            break;
        }

        case Opcode::LOAD_LOCAL: {
            uint16_t local_idx = fetch_u16();
            if (local_idx < locals_.size()) {
                push(locals_[local_idx]);
            }
            break;
        }

        case Opcode::STORE_LOCAL: {
            uint16_t local_idx = fetch_u16();
            Value value = pop();
            if (local_idx >= locals_.size()) {
                locals_.resize(local_idx + 1);
            }
            locals_[local_idx] = value;
            break;
        }

        case Opcode::ADD: {
            Value b = pop();
            Value a = pop();

            // Type-based dispatch for performance
            if (a.type() == TypeCode::F32 || b.type() == TypeCode::F32) {
                float result = a.as_f32() + b.as_f32();
                push(Value(TypeCode::F32, result));
            }
            else {
                int32_t result = a.as_i32() + b.as_i32();
                push(Value(TypeCode::I32, result));
            }
            break;
        }

        case Opcode::SUB: {
            Value b = pop();
            Value a = pop();

            if (a.type() == TypeCode::F32 || b.type() == TypeCode::F32) {
                float result = a.as_f32() - b.as_f32();
                push(Value(TypeCode::F32, result));
            }
            else {
                int32_t result = a.as_i32() - b.as_i32();
                push(Value(TypeCode::I32, result));
            }
            break;
        }

        case Opcode::MUL: {
            Value b = pop();
            Value a = pop();

            if (a.type() == TypeCode::F32 || b.type() == TypeCode::F32) {
                float result = a.as_f32() * b.as_f32();
                push(Value(TypeCode::F32, result));
            }
            else {
                int32_t result = a.as_i32() * b.as_i32();
                push(Value(TypeCode::I32, result));
            }
            break;
        }

        case Opcode::DIV: {
            Value b = pop();
            Value a = pop();

            if (b.as_f32() == 0.0f) {
                throw std::runtime_error("Division by zero");
            }

            if (a.type() == TypeCode::F32 || b.type() == TypeCode::F32) {
                float result = a.as_f32() / b.as_f32();
                push(Value(TypeCode::F32, result));
            }
            else {
                int32_t result = a.as_i32() / b.as_i32();
                push(Value(TypeCode::I32, result));
            }
            break;
        }

        case Opcode::CMP_EQ: {
            Value b = pop();
            Value a = pop();
            bool result = (a.as_i32() == b.as_i32());
            push(Value(TypeCode::BOOL, result));
            break;
        }

        case Opcode::CMP_LT: {
            Value b = pop();
            Value a = pop();

            if (a.type() == TypeCode::F32 || b.type() == TypeCode::F32) {
                bool result = a.as_f32() < b.as_f32();
                push(Value(TypeCode::BOOL, result));
            }
            else {
                bool result = a.as_i32() < b.as_i32();
                push(Value(TypeCode::BOOL, result));
            }
            break;
        }

        case Opcode::JMP: {
            int32_t offset = fetch_i32();
            ip_ += offset;
            break;
        }

        case Opcode::JMP_IF: {
            int32_t offset = fetch_i32();
            Value cond = pop();
            if (cond.as_bool()) {
                ip_ += offset;
            }
            break;
        }

        case Opcode::JMP_IF_NOT: {
            int32_t offset = fetch_i32();
            Value cond = pop();
            if (!cond.as_bool()) {
                ip_ += offset;
            }
            break;
        }

        case Opcode::CALL: {
            uint16_t func_idx = fetch_u16();
            uint8_t arg_count = fetch_byte();

            // Prepare arguments
            std::vector<Value> args;
            for (int i = 0; i < arg_count; ++i) {
                args.push_back(pop());
            }
            std::reverse(args.begin(), args.end());

            // Save execution state
            call_stack_.push_back(locals_);
            call_stack_.push_back(std::vector<Value>{Value(TypeCode::I32, static_cast<int32_t>(ip_))});

            // Set up new frame
            locals_ = args;
            ip_ = 0; // Simplified - real implementation would use function table
            break;
        }

        case Opcode::RET: {
            uint8_t has_value = fetch_byte();
            Value return_value;
            if (has_value) {
                return_value = pop();
            }

            if (call_stack_.size() < 2) {
                halted_ = true;
                break;
            }

            // Restore execution state
            std::vector<Value> return_ip_vec = call_stack_.back();
            call_stack_.pop_back();
            locals_ = call_stack_.back();
            call_stack_.pop_back();

            if (!return_ip_vec.empty()) {
                ip_ = return_ip_vec[0].as_i32();
            }

            if (has_value) {
                push(return_value);
            }
            break;
        }

        case Opcode::DUP: {
            if (!stack_.empty()) {
                push(stack_.back());
            }
            break;
        }

        case Opcode::POP: {
            if (!stack_.empty()) {
                stack_.pop_back();
            }
            break;
        }

        case Opcode::PRINT: {
            Value value = pop();
            std::cout << value.to_string() << std::endl;
            break;
        }

        case Opcode::HALT: {
            halted_ = true;
            break;
        }

        default:
            throw std::runtime_error("Unknown opcode: " + std::to_string(static_cast<int>(opcode)));
        }
    }
};

// ============================================================================
// PERFORMANCE PROFILER
// ============================================================================

// Per-opcode cycle accounting based on the CPU time-stamp counter.
// Usage: call start_measurement(), then record_instruction() after each
// instruction; every call attributes the cycles elapsed since the previous
// call (or since start_measurement()) to the given opcode.
class Profiler {
private:
    struct InstructionProfile {
        size_t execution_count;
        uint64_t total_cycles;
        std::string name;
    };

    std::unordered_map<Opcode, InstructionProfile> profiles_;
    uint64_t start_cycles_;

public:
    // The counter baseline is taken at construction so that
    // record_instruction() never reads an uninitialized value even when
    // start_measurement() was not called first.
    Profiler() : start_cycles_(__rdtsc()) {
        // Initialize profile names
        profiles_[Opcode::ADD] = { 0, 0, "ADD" };
        profiles_[Opcode::SUB] = { 0, 0, "SUB" };
        profiles_[Opcode::MUL] = { 0, 0, "MUL" };
        profiles_[Opcode::DIV] = { 0, 0, "DIV" };
        profiles_[Opcode::CALL] = { 0, 0, "CALL" };
        profiles_[Opcode::RET] = { 0, 0, "RET" };
        // Add more as needed
    }

    // Resets the baseline for the next record_instruction() call.
    void start_measurement() {
        start_cycles_ = __rdtsc();
    }

    // Attributes the cycles since the last baseline to `op` and moves the
    // baseline to now. `ip` is accepted for interface compatibility but is
    // not used.
    void record_instruction(Opcode op, [[maybe_unused]] size_t ip) {
        const uint64_t end_cycles = __rdtsc();
        const uint64_t cycles = end_cycles - start_cycles_;

        auto& profile = profiles_[op];
        if (profile.name.empty()) {
            // Opcodes not registered in the constructor would otherwise show
            // up in the report with a blank name.
            profile.name = "OP_" + std::to_string(static_cast<unsigned>(op));
        }
        profile.execution_count++;
        profile.total_cycles += cycles;

        start_cycles_ = end_cycles;
    }

    // Prints one row per opcode that was executed at least once.
    // Column widths add up to the 50-character rule and are wide enough for
    // their headers. std::cout's format flags and precision are restored
    // afterwards so later output is unaffected.
    void print_report() const {
        const std::ios_base::fmtflags saved_flags = std::cout.flags();
        const std::streamsize saved_precision = std::cout.precision();

        std::cout << "\n=== PERFORMANCE PROFILE ===" << std::endl;
        std::cout << std::setw(12) << "Instruction"
            << std::setw(12) << "Count"
            << std::setw(14) << "Total Cycles"
            << std::setw(12) << "Avg Cycles" << std::endl;
        std::cout << std::string(50, '-') << std::endl;

        for (const auto& [opcode, profile] : profiles_) {
            static_cast<void>(opcode);
            if (profile.execution_count > 0) {
                const double avg_cycles = static_cast<double>(profile.total_cycles) / profile.execution_count;
                std::cout << std::setw(12) << profile.name
                    << std::setw(12) << profile.execution_count
                    << std::setw(14) << profile.total_cycles
                    << std::setw(12) << std::fixed << std::setprecision(2) << avg_cycles << std::endl;
            }
        }

        std::cout.flags(saved_flags);
        std::cout.precision(saved_precision);
    }
};

// ============================================================================
// MEMORY POOL FOR EFFICIENT VALUE ALLOCATION
// ============================================================================

class ValuePool {
private:
    static const size_t POOL_SIZE = 4096;
    std::vector<Value> pool_;
    size_t current_index_;

public:
    ValuePool() : current_index_(0) {
        pool_.reserve(POOL_SIZE);
    }

    Value* allocate() {
        if (current_index_ >= pool_.size()) {
            pool_.emplace_back();
        }
        return &pool_[current_index_++];
    }

    void reset() {
        current_index_ = 0;
    }

    size_t size() const { return current_index_; }
};

// ============================================================================
// MAIN DEMONSTRATION
// ============================================================================

// Demo driver: runs a small factorial program once interpreted and once with
// the JIT enabled, then times repeated runs of both modes.
// Returns 0 on success and 1 if the VM throws.
int main() {
    std::cout << "High-Performance POP VM with JIT Optimizations" << std::endl;
    std::cout << "=============================================" << std::endl;

    // Test program: factorial(5), printed as 120.
    // Local slot 0 holds n, local slot 1 holds the accumulator.
    // Encodings: PUSH_INT = opcode, width byte, 4-byte value;
    // LOAD_LOCAL/STORE_LOCAL = opcode, 2-byte slot; jumps = opcode, 4-byte
    // signed offset relative to the following instruction. Multi-byte
    // operands are written least-significant byte first.
    std::vector<uint8_t> test_bytecode = {
        // 0: n = 5
        0x02, 0x20, 0x05, 0x00, 0x00, 0x00, // PUSH_INT 5
        0x11, 0x00, 0x00,                   // STORE_LOCAL 0

        // 9: acc = 1
        0x02, 0x20, 0x01, 0x00, 0x00, 0x00, // PUSH_INT 1
        0x11, 0x01, 0x00,                   // STORE_LOCAL 1

        // 18: loop_start: if (n < 2) goto end
        0x10, 0x00, 0x00,                   // LOAD_LOCAL 0
        0x02, 0x20, 0x02, 0x00, 0x00, 0x00, // PUSH_INT 2
        0x42,                               // CMP_LT
        0x51, 0x1C, 0x00, 0x00, 0x00,       // JMP_IF +28 (33 -> 61)

        // 33: acc = acc * n
        0x10, 0x01, 0x00,                   // LOAD_LOCAL 1
        0x10, 0x00, 0x00,                   // LOAD_LOCAL 0
        0x22,                               // MUL
        0x11, 0x01, 0x00,                   // STORE_LOCAL 1

        // 43: n = n - 1
        0x10, 0x00, 0x00,                   // LOAD_LOCAL 0
        0x02, 0x20, 0x01, 0x00, 0x00, 0x00, // PUSH_INT 1
        0x21,                               // SUB
        0x11, 0x00, 0x00,                   // STORE_LOCAL 0

        // 56: goto loop_start
        0x50, 0xD5, 0xFF, 0xFF, 0xFF,       // JMP -43 (61 -> 18)

        // 61: end: print acc
        0x10, 0x01, 0x00,                   // LOAD_LOCAL 1
        0x90,                               // PRINT
        0xA0                                // HALT
    };

    try {
        OptimizedVM vm;

        // The program above is raw opcodes without the "POPC" header, so
        // load_bytecode() returns false even though it still installs the
        // bytes; run() then interprets them from offset 0. The return value
        // is therefore deliberately not treated as an error here.

        // Test interpreted execution
        std::cout << "\n=== INTERPRETED EXECUTION ===" << std::endl;
        static_cast<void>(vm.load_bytecode(test_bytecode));
        vm.run(false); // Disable JIT for baseline

        // Test JIT-optimized execution
        std::cout << "\n=== JIT-OPTIMIZED EXECUTION ===" << std::endl;
        static_cast<void>(vm.load_bytecode(test_bytecode));
        vm.run(true); // Enable JIT

        // Performance comparison with different optimization levels
        std::cout << "\n=== PERFORMANCE COMPARISON ===" << std::endl;

        const size_t ITERATIONS = 10000;

        // Runs the program ITERATIONS times on fresh VMs and prints the
        // elapsed wall-clock time. The measurement includes the console
        // output each run produces.
        auto run_benchmark = [&](bool use_jit, const std::string& name) {
            const auto start = std::chrono::high_resolution_clock::now();

            for (size_t i = 0; i < ITERATIONS; ++i) {
                OptimizedVM bench_vm;
                static_cast<void>(bench_vm.load_bytecode(test_bytecode));
                bench_vm.run(use_jit);
            }

            const auto end = std::chrono::high_resolution_clock::now();
            const auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);

            std::cout << name << ": " << duration.count() << " ms for "
                << ITERATIONS << " iterations" << std::endl;
            };

        run_benchmark(false, "Interpreted ");
        run_benchmark(true, "JIT         ");
    }
    catch (const std::exception& error) {
        // The VM reports malformed bytecode and runtime faults by throwing;
        // report them instead of letting the process terminate abnormally.
        std::cerr << "VM error: " << error.what() << std::endl;
        return 1;
    }

    return 0;
}