mzp
/
VLCode-Lite


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
							#!/usr/bin/env node
import { spawnSync } from 'child_process';
import os from 'os';
import { CLIProvider } from '../src/core/llm-provider.js';
import { WorkflowExecutor } from '../src/vl/workflow-executor.js';

const DEFAULT_PROMPT = 'Reply with exactly OK. Do not add punctuation.';
const DEFAULT_TIMEOUT_MS = 180000;

/**
 * Returns the token that follows `flag` in `argv`.
 *
 * @param {string[]} argv - Command-line tokens (without node/script path).
 * @param {string} flag - Flag to look for, e.g. `--runs`.
 * @returns {string|undefined} The following token, or `undefined` when the
 *   flag is absent or is the last token.
 */
function readFlagValue(argv, flag) {
  const flagIdx = argv.indexOf(flag);
  // Guard the "not found" case explicitly: indexOf returns -1, and -1 + 1
  // would otherwise silently select argv[0].
  return flagIdx >= 0 ? argv[flagIdx + 1] : undefined;
}

/**
 * Converts a raw `--runs` value into a whole number of iterations.
 *
 * @param {string|undefined} raw - Raw flag value.
 * @returns {number} A finite integer >= 1; falls back to 1 for missing,
 *   non-numeric, non-finite, zero or negative input. Fractions round down.
 */
function parseRunCount(raw) {
  const parsed = Math.floor(Number(raw));
  return Number.isFinite(parsed) && parsed >= 1 ? parsed : 1;
}

/**
 * Converts a raw timeout setting into milliseconds.
 *
 * @param {string|undefined} raw - Raw value (e.g. from the environment).
 * @returns {number} A non-negative integer; falls back to
 *   DEFAULT_TIMEOUT_MS when the value is missing, empty, or not a
 *   non-negative integer (child_process rejects such timeouts).
 */
function parseTimeoutMs(raw) {
  const parsed = Number(raw || DEFAULT_TIMEOUT_MS);
  return Number.isInteger(parsed) && parsed >= 0 ? parsed : DEFAULT_TIMEOUT_MS;
}

// Command-line tokens after the node binary and the script path.
const args = process.argv.slice(2);
// Number of timed iterations per benchmark (`--runs N`, default 1).
const runs = parseRunCount(readFlagValue(args, '--runs'));
// Prompt sent to the benchmarked runners (`--prompt TEXT`); an absent or
// empty value selects the default prompt.
const prompt = String(readFlagValue(args, '--prompt') || DEFAULT_PROMPT);
// Model names are overridable through the environment; an empty codex model
// means no `--model` argument is added for codex.
const claudeModel = process.env.BENCH_CLAUDE_MODEL || 'claude-opus-4-6';
const codexModel = process.env.BENCH_CODEX_MODEL || '';
// Per-invocation timeout passed to the spawned CLIs, in milliseconds.
const timeoutMs = parseTimeoutMs(process.env.BENCH_TIMEOUT_MS);

/**
 * Reads the current wall-clock time.
 *
 * @returns {number} Integer milliseconds since the Unix epoch.
 */
function now() {
  const current = new Date();
  return current.getTime();
}

/**
 * Builds a short single-line preview of a value.
 *
 * Falsy input is treated as the empty string. Surrounding whitespace is
 * removed, every internal whitespace run becomes one space, and the result
 * is cut to at most 160 UTF-16 code units.
 *
 * @param {*} text - Value to preview; coerced with String().
 * @returns {string} The collapsed, truncated preview.
 */
function summarize(text) {
  const previewLimit = 160;
  const trimmed = String(text || '').trim();
  // After trimming there is no leading/trailing whitespace, so splitting on
  // whitespace runs and re-joining with a space collapses the inner runs.
  const singleLine = trimmed.split(/\s+/).join(' ');
  return singleLine.slice(0, previewLimit);
}

/**
 * Checks whether `which` can resolve the given command name.
 *
 * @param {string} command - Executable name to look up.
 * @returns {boolean} True only when `which <command>` exits with status 0.
 */
function commandExists(command) {
  const lookupOptions = { encoding: 'utf8' };
  const { status } = spawnSync('which', [command], lookupOptions);
  return status === 0;
}

/**
 * Computes the arithmetic mean of a list of numbers, rounded with Math.round.
 *
 * @param {number[]} values - Numbers to average.
 * @returns {number} The rounded mean, or 0 when the list is empty.
 */
function mean(values) {
  const count = values.length;
  if (!count) {
    return 0;
  }

  let total = 0;
  values.forEach((value) => {
    total += value;
  });
  return Math.round(total / count);
}

/**
 * Assembles the result record for one completed benchmark.
 *
 * @param {string} name - Benchmark identifier.
 * @param {boolean} comparable - Comparability flag, stored as given.
 * @param {Array<{elapsed_ms: number}>} samples - Per-run samples.
 * @returns {object} Record with `name`, `comparable`, `available: true`,
 *   `runs` (sample count), `mean_ms` (rounded mean of the elapsed times)
 *   and the `samples` array itself.
 */
function makeSummary(name, comparable, samples) {
  const elapsedTimes = samples.map(({ elapsed_ms }) => elapsed_ms);
  const summary = {
    name,
    comparable,
    available: true,
    runs: samples.length,
    mean_ms: mean(elapsedTimes),
    samples,
  };
  return summary;
}

/**
 * Times `runner` once per configured run, strictly one after another.
 *
 * @param {string} name - Benchmark identifier for the summary.
 * @param {boolean} comparable - Comparability flag forwarded to the summary.
 * @param {Function} runner - Sync or async function producing the response
 *   text; its result is awaited.
 * @returns {Promise<object>} Summary built by makeSummary from all samples.
 *   A runner that throws or rejects aborts the loop and propagates.
 */
async function benchmark(name, comparable, runner) {
  const samples = [];
  for (let completed = 0; completed < runs; completed += 1) {
    const startedAt = now();
    const response = await runner();
    // Take the end timestamp before building the preview so that the
    // preview work is not part of the measured interval.
    const finishedAt = now();
    samples.push({
      run: completed + 1,
      elapsed_ms: finishedAt - startedAt,
      preview: summarize(response),
    });
  }
  return makeSummary(name, comparable, samples);
}

/**
 * Invokes the `claude` CLI once, synchronously, feeding the prompt on stdin.
 *
 * @returns {string} The CLI's stdout.
 * @throws {Error} When the process cannot be run to completion (spawn
 *   failure, timeout, ...), is terminated by a signal, or exits non-zero.
 */
function runClaudeRaw() {
  const cliArgs = [
    '--print',
    '--no-session-persistence',
    '--tools',
    '',
    '--model',
    claudeModel,
  ];
  const result = spawnSync('claude', cliArgs, {
    input: prompt,
    encoding: 'utf8',
    timeout: timeoutMs,
  });

  // spawnSync reports spawn failures and timeouts through `error` while
  // leaving `status` null; surface that cause instead of "code null".
  if (result.error) {
    throw new Error(`claude failed to run: ${result.error.message}`, { cause: result.error });
  }

  if (result.status !== 0) {
    const outcome = result.signal
      ? `was terminated by signal ${result.signal}`
      : `exited with code ${result.status}`;
    throw new Error(result.stderr?.trim() || `claude ${outcome}`);
  }

  return result.stdout;
}

/**
 * Sends the prompt as a single user message through a fresh CLIProvider.
 *
 * @returns {Promise<string>} The `text` of the first entry in the response's
 *   `content`, or '' when that entry or its text is missing/falsy.
 */
async function runVLCodeProvider() {
  const providerOptions = {
    model: claudeModel,
    workDir: process.cwd(),
  };
  const provider = new CLIProvider(providerOptions);

  const userMessage = { role: 'user', content: prompt };
  const response = await provider.messages.create({
    messages: [userMessage],
  });

  const firstBlock = response.content?.[0];
  return firstBlock?.text || '';
}

/**
 * Runs a two-step workflow (one LLM step followed by a stop step) through a
 * fresh WorkflowExecutor, passing the prompt as the `prompt` parameter.
 *
 * @returns {Promise<string>} The `$answer` variable read from the executor's
 *   context after execution, or '' when it is missing/falsy.
 * @throws {Error} When the executor invokes the `onError` callback; the
 *   callback rethrows the reported message.
 */
async function runWorkflowExecutor() {
  const registry = {
    params: ['prompt(STRING)'],
    services: [],
    apis: [],
    components: [],
    vars: ['$answer(STRING)'],
    files: { inputs: [], artifacts: [] },
  };

  // NOTE(review): '=prompt' and '=_result' look like executor expressions
  // bound to the input parameter and the step result — confirm against
  // WorkflowExecutor.
  const llmStep = {
    id: 'LLM_Benchmark',
    meta: { title: 'Benchmark LLM Call' },
    in: {
      messages: [{ role: 'user', content: '=prompt' }],
    },
    out: { '$answer': '=_result' },
    next: 'Stop_Done',
  };
  const stopStep = {
    id: 'Stop_Done',
    meta: { title: 'Done' },
  };

  const workflow = {
    version: '3.16',
    name: 'BenchmarkWorkflow',
    registry,
    steps: [llmStep, stopStep],
  };

  const executor = new WorkflowExecutor({
    model: claudeModel,
    workDir: process.cwd(),
  });

  // Turn every error reported by the executor into a thrown exception.
  const onError = (message) => {
    throw new Error(message);
  };
  await executor.execute(workflow, { prompt }, { onError });

  const answer = executor._ctx?.variables?.$answer;
  return answer || '';
}

/**
 * Invokes `codex exec` once, synchronously, from the OS temp directory with
 * a read-only sandbox.
 *
 * The prompt is the fixed string below, not the configured `prompt`; a
 * `--model` argument is added only when a codex model is configured.
 *
 * @returns {string} The CLI's stdout.
 * @throws {Error} When the process cannot be run to completion (spawn
 *   failure, timeout, ...), is terminated by a signal, or exits non-zero.
 */
function runCodexExec() {
  const cliArgs = [
    'exec',
    '--skip-git-repo-check',
    '--sandbox',
    'read-only',
    '--cd',
    os.tmpdir(),
  ];
  if (codexModel) {
    cliArgs.push('--model', codexModel);
  }
  cliArgs.push('Reply with exactly OK. Do not run commands. Do not inspect files.');

  const result = spawnSync('codex', cliArgs, {
    encoding: 'utf8',
    timeout: timeoutMs,
  });

  // spawnSync reports spawn failures and timeouts through `error` while
  // leaving `status` null; surface that cause instead of "code null".
  if (result.error) {
    throw new Error(`codex failed to run: ${result.error.message}`, { cause: result.error });
  }

  if (result.status !== 0) {
    const outcome = result.signal
      ? `was terminated by signal ${result.signal}`
      : `exited with code ${result.status}`;
    throw new Error(result.stderr?.trim() || `codex ${outcome}`);
  }

  return result.stdout;
}

/**
 * Runs every benchmark whose CLI is found and prints one JSON report to
 * stdout.
 *
 * The three claude-based benchmarks run in sequence and any failure among
 * them propagates to the caller. A codex failure is caught and recorded as
 * an unavailable entry. Gemini is only probed for presence; no runner is
 * executed for it.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const benchmarks = [];
  const report = {
    generated_at: new Date().toISOString(),
    cwd: process.cwd(),
    runs,
    prompt,
    benchmarks,
  };

  // Placeholder entry for a benchmark that produced no samples.
  const placeholder = (name, available, comparable, reason) => ({
    name,
    available,
    comparable,
    reason,
  });

  if (!commandExists('claude')) {
    benchmarks.push(placeholder('claude_raw', false, true, 'claude CLI not installed'));
  } else {
    const claudeBased = [
      ['claude_raw', runClaudeRaw],
      ['vlcode_cli_provider', runVLCodeProvider],
      ['vlcode_workflow_executor', runWorkflowExecutor],
    ];
    for (const [name, runner] of claudeBased) {
      benchmarks.push(await benchmark(name, true, runner));
    }
  }

  if (!commandExists('codex')) {
    benchmarks.push(placeholder('codex_exec', false, false, 'codex CLI not installed'));
  } else {
    try {
      benchmarks.push(await benchmark('codex_exec', false, runCodexExec));
    } catch (error) {
      benchmarks.push(placeholder('codex_exec', false, false, error.message));
    }
  }

  const geminiInstalled = commandExists('gemini');
  const geminiReason = geminiInstalled
    ? 'installed but no benchmark runner implemented yet'
    : 'gemini CLI not installed';
  benchmarks.push(placeholder('gemini_cli', geminiInstalled, false, geminiReason));

  console.log(JSON.stringify(report, null, 2));
}

// Script entry point: any rejection from main() is printed to stderr and
// the process exits with status 1.
const exitWithFailure = (failure) => {
  console.error(failure);
  process.exit(1);
};

main().catch(exitWithFailure);