| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203 |
- #!/usr/bin/env node
- import { spawnSync } from 'child_process';
- import os from 'os';
- import { CLIProvider } from '../src/core/llm-provider.js';
- import { WorkflowExecutor } from '../src/vl/workflow-executor.js';
/**
 * Return the argument that follows `flag` in `argv`.
 *
 * @param {string[]} argv - Command-line arguments (without node/script path).
 * @param {string} flag - Flag to look for, e.g. `--runs`.
 * @returns {string|undefined} The following argument, or `undefined` when the
 *   flag is absent or is the last argument.
 */
function readFlagValue(argv, flag) {
  const index = argv.indexOf(flag);
  // Guard the "not found" case explicitly: indexOf returns -1, and -1 + 1
  // would otherwise silently read argv[0].
  return index >= 0 ? argv[index + 1] : undefined;
}

/**
 * Coerce `raw` to a whole number that is at least `min`.
 *
 * @param {unknown} raw - Value to parse (typically a string from argv/env).
 * @param {number} fallback - Returned when `raw` is missing, blank,
 *   non-numeric, non-finite, or below `min`.
 * @param {number} min - Smallest accepted value (inclusive).
 * @returns {number} An integer >= `min`, or `fallback`.
 */
function toBoundedInteger(raw, fallback, min) {
  if (raw == null || String(raw).trim() === '') {
    return fallback;
  }
  const parsed = Number(raw);
  if (!Number.isFinite(parsed)) {
    return fallback;
  }
  // Fractions are truncated downwards so loop counts and timeouts stay integral.
  const whole = Math.floor(parsed);
  return whole >= min ? whole : fallback;
}

const DEFAULT_PROMPT = 'Reply with exactly OK. Do not add punctuation.';
const DEFAULT_TIMEOUT_MS = 180000;

const args = process.argv.slice(2);

// Number of times each benchmark runner is invoked; always a whole number >= 1.
const runs = toBoundedInteger(readFlagValue(args, '--runs'), 1, 1);

// Prompt text; a missing or empty `--prompt` value falls back to the default.
const prompt = String(readFlagValue(args, '--prompt') || DEFAULT_PROMPT);

const claudeModel = process.env.BENCH_CLAUDE_MODEL || 'claude-opus-4-6';
const codexModel = process.env.BENCH_CODEX_MODEL || '';

// Per-process timeout handed to spawnSync. An unparsable BENCH_TIMEOUT_MS falls
// back to the default instead of producing NaN.
const timeoutMs = toBoundedInteger(process.env.BENCH_TIMEOUT_MS, DEFAULT_TIMEOUT_MS, 0);
/**
 * Read the current wall-clock time.
 *
 * @returns {number} Milliseconds elapsed since the Unix epoch.
 */
function now() {
  const current = new Date();
  return current.getTime();
}
/**
 * Build a short single-line preview of a runner's output.
 *
 * Falsy input becomes an empty string. Otherwise the text is trimmed, every
 * run of whitespace is collapsed to one space, and the result is cut to at
 * most 160 characters.
 *
 * @param {unknown} text - Raw output to preview.
 * @returns {string} The condensed preview.
 */
function summarize(text) {
  const previewLimit = 160;
  const raw = text ? String(text) : '';
  const singleLine = raw.trim().replace(/\s+/g, ' ');
  return singleLine.substring(0, previewLimit);
}
/**
 * Check whether an executable can be located by running `which <command>`.
 *
 * @param {string} command - Executable name to look up.
 * @returns {boolean} `true` only when `which` exits with status 0.
 */
function commandExists(command) {
  const lookupOptions = { encoding: 'utf8' };
  const { status } = spawnSync('which', [command], lookupOptions);
  return status === 0;
}
/**
 * Arithmetic mean of a list of numbers, rounded to the nearest integer.
 *
 * @param {number[]} values - Numbers to average.
 * @returns {number} The rounded mean, or 0 for an empty list.
 */
function mean(values) {
  const count = values.length;
  if (!count) {
    return 0;
  }
  let total = 0;
  values.forEach((value) => {
    total += value;
  });
  return Math.round(total / count);
}
/**
 * Assemble the report entry for a benchmark that completed.
 *
 * @param {string} name - Benchmark identifier.
 * @param {boolean} comparable - Passed through to the entry unchanged.
 * @param {Array<{elapsed_ms: number}>} samples - One record per run.
 * @returns {object} Entry with `name`, `comparable`, `available: true`,
 *   `runs` (the sample count), `mean_ms` (mean of the samples' `elapsed_ms`)
 *   and the `samples` themselves.
 */
function makeSummary(name, comparable, samples) {
  const durations = samples.map(({ elapsed_ms }) => elapsed_ms);
  const summary = {
    name,
    comparable,
    available: true,
    runs: samples.length,
    mean_ms: mean(durations),
    samples,
  };
  return summary;
}
/**
 * Time `runner` once per configured run and summarise the results.
 *
 * Runs happen one after another. Each sample records its 1-based run number,
 * the elapsed time between two `now()` readings taken around the awaited
 * runner call, and a `summarize`d preview of the value the runner produced.
 * A runner that throws or rejects propagates out of this function.
 *
 * @param {string} name - Benchmark identifier forwarded to `makeSummary`.
 * @param {boolean} comparable - Forwarded to `makeSummary`.
 * @param {Function} runner - Sync or async function producing the output.
 * @returns {Promise<object>} The value returned by `makeSummary`.
 */
async function benchmark(name, comparable, runner) {
  const samples = [];
  let completed = 0;
  while (completed < runs) {
    const startedAt = now();
    const produced = await runner();
    const elapsed_ms = now() - startedAt;
    completed += 1;
    samples.push({ run: completed, elapsed_ms, preview: summarize(produced) });
  }
  return makeSummary(name, comparable, samples);
}
/**
 * Invoke the `claude` CLI once, synchronously, feeding the prompt on stdin.
 *
 * @returns {string} Whatever the CLI wrote to stdout.
 * @throws {Error} When the process cannot be started or times out (the
 *   underlying spawn error is attached as `cause`), when it is killed by a
 *   signal, or when it exits with a non-zero status.
 */
function runClaudeRaw() {
  const cliArgs = [
    '--print',
    '--no-session-persistence',
    // NOTE(review): the empty value presumably disables tool use — confirm
    // against the claude CLI documentation.
    '--tools',
    '',
    '--model',
    claudeModel,
  ];
  const result = spawnSync('claude', cliArgs, {
    input: prompt,
    encoding: 'utf8',
    timeout: timeoutMs,
  });
  // spawnSync reports launch failures and timeouts through `error` while
  // leaving `status` null; surface those instead of "exited with code null".
  if (result.error) {
    throw new Error(`claude failed to run: ${result.error.message}`, { cause: result.error });
  }
  if (result.status !== 0) {
    const fallback = result.signal
      ? `claude was terminated by signal ${result.signal}`
      : `claude exited with code ${result.status}`;
    throw new Error(result.stderr?.trim() || fallback);
  }
  return result.stdout;
}
/**
 * Send the prompt as a single user message through a fresh `CLIProvider`.
 *
 * The provider is constructed with the configured Claude model and the
 * current working directory.
 *
 * @returns {Promise<string>} The `text` of the first content item in the
 *   response, or an empty string when that item or its text is missing.
 */
async function runVLCodeProvider() {
  const providerOptions = { model: claudeModel, workDir: process.cwd() };
  const provider = new CLIProvider(providerOptions);

  const userTurn = { role: 'user', content: prompt };
  const reply = await provider.messages.create({ messages: [userTurn] });

  const firstPart = reply.content?.[0];
  return firstPart?.text || '';
}
/**
 * Run the prompt through a minimal two-step workflow on a `WorkflowExecutor`.
 *
 * The workflow has one step that sends the `prompt` parameter as a user
 * message and stores `_result` into `$answer`, followed by a terminal step.
 * Any message passed to the executor's `onError` hook is rethrown as an
 * `Error`.
 *
 * @returns {Promise<string>} The `$answer` variable read from the executor's
 *   `_ctx.variables` after execution, or an empty string when it is missing.
 */
async function runWorkflowExecutor() {
  const registry = {
    params: ['prompt(STRING)'],
    services: [],
    apis: [],
    components: [],
    vars: ['$answer(STRING)'],
    files: { inputs: [], artifacts: [] },
  };

  const llmStep = {
    id: 'LLM_Benchmark',
    meta: { title: 'Benchmark LLM Call' },
    in: {
      messages: [{ role: 'user', content: '=prompt' }],
    },
    out: { '$answer': '=_result' },
    next: 'Stop_Done',
  };

  const stopStep = {
    id: 'Stop_Done',
    meta: { title: 'Done' },
  };

  const workflow = {
    version: '3.16',
    name: 'BenchmarkWorkflow',
    registry,
    steps: [llmStep, stopStep],
  };

  const executor = new WorkflowExecutor({
    model: claudeModel,
    workDir: process.cwd(),
  });

  const failFast = (message) => {
    throw new Error(message);
  };
  await executor.execute(workflow, { prompt }, { onError: failFast });

  const answer = executor._ctx?.variables?.$answer;
  return answer || '';
}
/**
 * Invoke `codex exec` once, synchronously, from the OS temp directory with a
 * read-only sandbox.
 *
 * The instruction sent is a fixed string rather than the configured prompt;
 * `main` registers this benchmark with `comparable: false`. `--model` is only
 * passed when a Codex model has been configured.
 *
 * @returns {string} Whatever the CLI wrote to stdout.
 * @throws {Error} When the process cannot be started or times out (the
 *   underlying spawn error is attached as `cause`), when it is killed by a
 *   signal, or when it exits with a non-zero status.
 */
function runCodexExec() {
  const cliArgs = [
    'exec',
    '--skip-git-repo-check',
    '--sandbox',
    'read-only',
    '--cd',
    os.tmpdir(),
  ];
  if (codexModel) {
    cliArgs.push('--model', codexModel);
  }
  cliArgs.push('Reply with exactly OK. Do not run commands. Do not inspect files.');

  const result = spawnSync('codex', cliArgs, {
    encoding: 'utf8',
    timeout: timeoutMs,
  });
  // spawnSync reports launch failures and timeouts through `error` while
  // leaving `status` null; surface those instead of "exited with code null".
  if (result.error) {
    throw new Error(`codex failed to run: ${result.error.message}`, { cause: result.error });
  }
  if (result.status !== 0) {
    const fallback = result.signal
      ? `codex was terminated by signal ${result.signal}`
      : `codex exited with code ${result.status}`;
    throw new Error(result.stderr?.trim() || fallback);
  }
  return result.stdout;
}
/**
 * Run every available benchmark and print one JSON report to stdout.
 *
 * - When the `claude` CLI is found, the three Claude-backed benchmarks run in
 *   order. A failure in one is logged to stderr, recorded in the report as an
 *   unavailable entry carrying the error message, and marks the process for a
 *   non-zero exit; the remaining benchmarks still run so collected data is
 *   not lost.
 * - When the `codex` CLI is found, its benchmark runs; a failure is recorded
 *   in the report without affecting the exit code.
 * - The `gemini` CLI is only probed; no runner exists for it.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const output = {
    generated_at: new Date().toISOString(),
    cwd: process.cwd(),
    runs,
    prompt,
    benchmarks: [],
  };

  // Report entry for a benchmark that did not produce results.
  const unavailable = (name, comparable, reason) => ({
    name,
    available: false,
    comparable,
    reason,
  });
  const describe = (error) => error?.message ?? String(error);

  if (commandExists('claude')) {
    const claudeBacked = [
      ['claude_raw', runClaudeRaw],
      ['vlcode_cli_provider', runVLCodeProvider],
      ['vlcode_workflow_executor', runWorkflowExecutor],
    ];
    for (const [name, runner] of claudeBacked) {
      try {
        output.benchmarks.push(await benchmark(name, true, runner));
      } catch (error) {
        // Keep the failure visible and the exit status non-zero, but do not
        // throw away the results gathered so far.
        console.error(error);
        process.exitCode = 1;
        output.benchmarks.push(unavailable(name, true, describe(error)));
      }
    }
  } else {
    output.benchmarks.push(unavailable('claude_raw', true, 'claude CLI not installed'));
  }

  if (commandExists('codex')) {
    try {
      output.benchmarks.push(await benchmark('codex_exec', false, runCodexExec));
    } catch (error) {
      output.benchmarks.push(unavailable('codex_exec', false, describe(error)));
    }
  } else {
    output.benchmarks.push(unavailable('codex_exec', false, 'codex CLI not installed'));
  }

  if (commandExists('gemini')) {
    output.benchmarks.push({
      name: 'gemini_cli',
      available: true,
      comparable: false,
      reason: 'installed but no benchmark runner implemented yet',
    });
  } else {
    output.benchmarks.push(unavailable('gemini_cli', false, 'gemini CLI not installed'));
  }

  console.log(JSON.stringify(output, null, 2));
}
// Script entry point: start the benchmark run; any rejection that escapes
// main() is printed to stderr and the process exits with status 1.
const exitWithFailure = (failure) => {
  console.error(failure);
  process.exit(1);
};

main().catch(exitWithFailure);
|