#!/usr/bin/env node
/**
 * Latency benchmark for LLM command-line tools.
 *
 * Runs the same short prompt through several call paths (the raw `claude` CLI,
 * the project's CLIProvider, the project's WorkflowExecutor, and `codex exec`),
 * measures wall-clock time per run, and prints one JSON report to stdout.
 *
 * Command-line flags:
 *   --runs <n>       number of timed runs per benchmark (default 1)
 *   --prompt <text>  prompt sent to the claude-based benchmarks
 *
 * Environment variables:
 *   BENCH_CLAUDE_MODEL  model passed to the claude-based benchmarks
 *   BENCH_CODEX_MODEL   optional model passed to `codex exec`
 *   BENCH_TIMEOUT_MS    per-invocation timeout for spawned CLIs
 */
import { spawnSync } from 'child_process';
import os from 'os';
import { CLIProvider } from '../src/core/llm-provider.js';
import { WorkflowExecutor } from '../src/vl/workflow-executor.js';

const DEFAULT_PROMPT = 'Reply with exactly OK. Do not add punctuation.';
const DEFAULT_TIMEOUT_MS = 180000;

/**
 * Reads the `--runs` flag.
 *
 * The value is only looked up when the flag is actually present; otherwise an
 * index of -1 would make the lookup land on the first argument.
 *
 * @param {string[]} argv - command-line arguments without node/script path
 * @returns {number} a whole number of runs, at least 1
 */
function parseRuns(argv) {
  const flagIndex = argv.indexOf('--runs');
  if (flagIndex < 0) return 1;
  // Floor so the reported run count always equals the number of loop iterations.
  const parsed = Math.floor(Number(argv[flagIndex + 1]));
  return Number.isFinite(parsed) && parsed >= 1 ? parsed : 1;
}

/**
 * Reads the `--prompt` flag, falling back to the default prompt when the flag
 * or its value is missing or empty.
 *
 * @param {string[]} argv - command-line arguments without node/script path
 * @returns {string} the prompt text
 */
function parsePrompt(argv) {
  const flagIndex = argv.indexOf('--prompt');
  if (flagIndex < 0) return DEFAULT_PROMPT;
  return String(argv[flagIndex + 1] || DEFAULT_PROMPT);
}

/**
 * Parses the timeout override.
 *
 * Unset, empty, non-numeric or negative values fall back to the default
 * instead of handing spawnSync a value it rejects. An explicit 0 is passed
 * through unchanged.
 *
 * @param {string | undefined} raw - raw environment variable value
 * @returns {number} a non-negative whole number of milliseconds
 */
function parseTimeoutMs(raw) {
  if (!raw) return DEFAULT_TIMEOUT_MS;
  const parsed = Math.floor(Number(raw));
  return Number.isFinite(parsed) && parsed >= 0 ? parsed : DEFAULT_TIMEOUT_MS;
}

const args = process.argv.slice(2);
const runs = parseRuns(args);
const prompt = parsePrompt(args);
const claudeModel = process.env.BENCH_CLAUDE_MODEL || 'claude-opus-4-6';
const codexModel = process.env.BENCH_CODEX_MODEL || '';
const timeoutMs = parseTimeoutMs(process.env.BENCH_TIMEOUT_MS);

/** @returns {number} current wall-clock time in milliseconds */
function now() {
  return Date.now();
}

/**
 * Produces a short single-line preview of a benchmark response.
 *
 * @param {unknown} text - raw response; falsy values become an empty string
 * @returns {string} whitespace-collapsed text, at most 160 characters
 */
function summarize(text) {
  return String(text || '').trim().replace(/\s+/g, ' ').slice(0, 160);
}

/**
 * Checks whether an executable can be located with `which`.
 *
 * @param {string} command - executable name
 * @returns {boolean} true when `which` exits with status 0
 */
function commandExists(command) {
  const result = spawnSync('which', [command], {
    encoding: 'utf8',
  });
  return result.status === 0;
}

/**
 * @param {number[]} values
 * @returns {number} rounded arithmetic mean, or 0 for an empty list
 */
function mean(values) {
  if (!values.length) return 0;
  return Math.round(values.reduce((sum, value) => sum + value, 0) / values.length);
}

/**
 * Builds the report entry for a benchmark that ran.
 *
 * @param {string} name - benchmark identifier
 * @param {boolean} comparable - flag copied verbatim into the report
 * @param {{ run: number, elapsed_ms: number, preview: string }[]} samples
 * @returns {object} report entry including the mean elapsed time
 */
function makeSummary(name, comparable, samples) {
  return {
    name,
    comparable,
    available: true,
    runs: samples.length,
    mean_ms: mean(samples.map((sample) => sample.elapsed_ms)),
    samples,
  };
}

/**
 * Times `runner` for the configured number of runs, one after another.
 *
 * @param {string} name - benchmark identifier
 * @param {boolean} comparable - flag copied verbatim into the report
 * @param {() => (string | Promise<string>)} runner - produces the response text
 * @returns {Promise<object>} report entry from makeSummary
 * @throws whatever `runner` throws; a failing run aborts the benchmark
 */
async function benchmark(name, comparable, runner) {
  const samples = [];
  for (let i = 0; i < runs; i++) {
    const started = now();
    const value = await runner();
    samples.push({
      run: i + 1,
      elapsed_ms: now() - started,
      preview: summarize(value),
    });
  }
  return makeSummary(name, comparable, samples);
}

/**
 * Runs an external CLI synchronously and returns its stdout.
 *
 * @param {string} command - executable to spawn
 * @param {string[]} cliArgs - arguments for the executable
 * @param {object} [options] - extra spawnSync options (e.g. `input`)
 * @returns {string} captured stdout
 * @throws {Error} when the process cannot be spawned, times out, is killed by
 *   a signal, or exits with a non-zero status
 */
function runCli(command, cliArgs, options = {}) {
  const result = spawnSync(command, cliArgs, {
    ...options,
    encoding: 'utf8',
    timeout: timeoutMs,
  });
  // spawnSync reports spawn failures and timeouts through `error`; in those
  // cases `status` is null, so checking the status alone hides the real cause.
  if (result.error) {
    const reason = result.error.code === 'ETIMEDOUT'
      ? `timed out after ${timeoutMs}ms`
      : result.error.message;
    throw new Error(`${command} failed: ${reason}`, { cause: result.error });
  }
  if (result.status !== 0) {
    const fallback = result.signal
      ? `${command} was terminated by signal ${result.signal}`
      : `${command} exited with code ${result.status}`;
    throw new Error(result.stderr?.trim() || fallback);
  }
  return result.stdout;
}

/**
 * Sends the prompt to the `claude` CLI on stdin and returns its stdout.
 *
 * @returns {string} raw CLI output
 * @throws {Error} when the CLI fails (see runCli)
 */
function runClaudeRaw() {
  return runCli('claude', [
    '--print',
    '--no-session-persistence',
    '--tools',
    '',
    '--model',
    claudeModel,
  ], {
    input: prompt,
  });
}

/**
 * Sends the prompt through the project's CLIProvider.
 *
 * @returns {Promise<string>} text of the first content item, or '' if absent
 */
async function runVLCodeProvider() {
  const provider = new CLIProvider({
    model: claudeModel,
    workDir: process.cwd(),
  });
  const response = await provider.messages.create({
    messages: [{ role: 'user', content: prompt }],
  });
  return response.content?.[0]?.text || '';
}

/**
 * Runs a two-step workflow through the project's WorkflowExecutor and returns
 * the `$answer` variable read from the executor's context afterwards.
 *
 * NOTE(review): how the executor interprets the step ids and the `=prompt` /
 * `=_result` expressions is defined outside this file — confirm there.
 *
 * @returns {Promise<string>} value of `$answer`, or '' if it was not set
 */
async function runWorkflowExecutor() {
  const workflow = {
    version: '3.16',
    name: 'BenchmarkWorkflow',
    registry: {
      params: ['prompt(STRING)'],
      services: [],
      apis: [],
      components: [],
      vars: ['$answer(STRING)'],
      files: { inputs: [], artifacts: [] },
    },
    steps: [
      {
        id: 'LLM_Benchmark',
        meta: { title: 'Benchmark LLM Call' },
        in: {
          messages: [{ role: 'user', content: '=prompt' }],
        },
        out: { '$answer': '=_result' },
        next: 'Stop_Done',
      },
      {
        id: 'Stop_Done',
        meta: { title: 'Done' },
      },
    ],
  };
  const executor = new WorkflowExecutor({
    model: claudeModel,
    workDir: process.cwd(),
  });
  await executor.execute(workflow, { prompt }, {
    onError: (message) => {
      throw new Error(message);
    },
  });
  return executor._ctx?.variables?.$answer || '';
}

/**
 * Runs `codex exec` with a fixed prompt from the OS temp directory in a
 * read-only sandbox. It does not use the `--prompt` value.
 *
 * @returns {string} raw CLI output
 * @throws {Error} when the CLI fails (see runCli)
 */
function runCodexExec() {
  const cliArgs = [
    'exec',
    '--skip-git-repo-check',
    '--sandbox',
    'read-only',
    '--cd',
    os.tmpdir(),
  ];
  if (codexModel) {
    cliArgs.push('--model', codexModel);
  }
  cliArgs.push('Reply with exactly OK. Do not run commands. Do not inspect files.');
  return runCli('codex', cliArgs);
}

/**
 * Runs every benchmark whose CLI is installed and prints the JSON report.
 *
 * A failure in any claude-based benchmark propagates to the caller; a codex
 * failure is recorded in the report as an unavailable entry instead.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const output = {
    generated_at: new Date().toISOString(),
    cwd: process.cwd(),
    runs,
    prompt,
    benchmarks: [],
  };

  if (commandExists('claude')) {
    output.benchmarks.push(await benchmark('claude_raw', true, runClaudeRaw));
    output.benchmarks.push(await benchmark('vlcode_cli_provider', true, runVLCodeProvider));
    output.benchmarks.push(await benchmark('vlcode_workflow_executor', true, runWorkflowExecutor));
  } else {
    output.benchmarks.push({ name: 'claude_raw', available: false, comparable: true, reason: 'claude CLI not installed' });
  }

  if (commandExists('codex')) {
    try {
      output.benchmarks.push(await benchmark('codex_exec', false, runCodexExec));
    } catch (error) {
      output.benchmarks.push({ name: 'codex_exec', available: false, comparable: false, reason: error.message });
    }
  } else {
    output.benchmarks.push({ name: 'codex_exec', available: false, comparable: false, reason: 'codex CLI not installed' });
  }

  if (commandExists('gemini')) {
    output.benchmarks.push({ name: 'gemini_cli', available: true, comparable: false, reason: 'installed but no benchmark runner implemented yet' });
  } else {
    output.benchmarks.push({ name: 'gemini_cli', available: false, comparable: false, reason: 'gemini CLI not installed' });
  }

  console.log(JSON.stringify(output, null, 2));
}

main().catch((error) => {
  console.error(error);
  process.exit(1);
});