benchmark-llm-overhead.js 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. #!/usr/bin/env node
  2. import { spawnSync } from 'child_process';
  3. import os from 'os';
  4. import { CLIProvider } from '../src/core/llm-provider.js';
  5. import { WorkflowExecutor } from '../src/vl/workflow-executor.js';
  // ---------------------------------------------------------------------------
  // Command-line and environment configuration.
  //
  //   --runs <n>          how many times each benchmark runner is invoked
  //   --prompt <text>     prompt sent to the Claude-backed runners
  //   BENCH_CLAUDE_MODEL  model passed to the claude CLI / providers
  //   BENCH_CODEX_MODEL   optional model passed to `codex exec`
  //   BENCH_TIMEOUT_MS    per-invocation timeout for the spawned CLIs
  // ---------------------------------------------------------------------------
  const DEFAULT_PROMPT = 'Reply with exactly OK. Do not add punctuation.';
  const DEFAULT_TIMEOUT_MS = 180000;

  /**
   * Look up the token that follows `flag` in an argument list.
   *
   * @param {string[]} argList - Arguments to search (e.g. `process.argv.slice(2)`).
   * @param {string} flag - Flag to look for, such as `'--runs'`.
   * @returns {string|undefined} The token after the flag, or `undefined` when
   *   the flag is absent or is the last token.
   */
  function flagValue(argList, flag) {
    const at = argList.indexOf(flag);
    // Guard on the index: indexOf() yields -1 for a missing flag, and reading
    // argList[-1 + 1] would silently pick up the first positional argument.
    return at >= 0 ? argList[at + 1] : undefined;
  }

  /**
   * Parse `raw` as a whole number that is at least `min`.
   *
   * @param {unknown} raw - Candidate value (usually a string from argv/env).
   * @param {number} min - Smallest accepted value.
   * @param {number} fallback - Returned when `raw` is missing, blank,
   *   non-numeric, non-finite, or below `min`.
   * @returns {number} The parsed integer (fractions are floored) or `fallback`.
   */
  function toIntAtLeast(raw, min, fallback) {
    if (raw == null || String(raw).trim() === '') return fallback;
    const parsed = Math.floor(Number(raw));
    return Number.isFinite(parsed) && parsed >= min ? parsed : fallback;
  }

  const args = process.argv.slice(2);
  // Always a finite integer >= 1 so the benchmark loop is bounded and the
  // reported run count matches the number of samples taken.
  const runs = toIntAtLeast(flagValue(args, '--runs'), 1, 1);
  const prompt = String(flagValue(args, '--prompt') || DEFAULT_PROMPT);
  const claudeModel = process.env.BENCH_CLAUDE_MODEL || 'claude-opus-4-6';
  const codexModel = process.env.BENCH_CODEX_MODEL || '';
  // 0 is accepted; anything unparseable or negative falls back to the default
  // instead of handing NaN to spawnSync.
  const timeoutMs = toIntAtLeast(process.env.BENCH_TIMEOUT_MS, 0, DEFAULT_TIMEOUT_MS);
  /**
   * Current wall-clock time.
   *
   * @returns {number} Milliseconds since the Unix epoch.
   */
  function now() {
    const stamp = new Date();
    return stamp.getTime();
  }
  /**
   * Build a short single-line preview of a runner's output.
   *
   * Falsy input becomes the empty string; otherwise the value is stringified,
   * trimmed, has every whitespace run collapsed to one space, and is cut to at
   * most 160 characters.
   *
   * @param {unknown} text - Raw output to preview.
   * @returns {string} The condensed preview.
   */
  function summarize(text) {
    const raw = String(text || '');
    const singleLine = raw.trim().replace(/\s+/g, ' ');
    return singleLine.substring(0, 160);
  }
  /**
   * Report whether an executable with the given name can be found on PATH.
   *
   * The lookup is delegated to the platform's locator tool: `where` on Windows
   * and `which` everywhere else, so the check is not limited to systems that
   * ship `which`.
   *
   * @param {string} command - Bare executable name, e.g. `'claude'`.
   * @returns {boolean} `true` only when the locator exits with status 0; an
   *   empty or non-string name, a missing executable, or a locator that cannot
   *   be spawned all yield `false`.
   */
  function commandExists(command) {
    if (typeof command !== 'string' || command === '') {
      return false;
    }
    const locator = process.platform === 'win32' ? 'where' : 'which';
    // Only the exit status matters, so the locator's output is discarded.
    const result = spawnSync(locator, [command], { stdio: 'ignore' });
    return result.status === 0;
  }
  /**
   * Arithmetic mean of a list of numbers, rounded to the nearest integer.
   *
   * @param {number[]} values - Numbers to average.
   * @returns {number} The rounded mean, or 0 for an empty list.
   */
  function mean(values) {
    const count = values.length;
    if (!count) {
      return 0;
    }
    let total = 0;
    values.forEach((value) => {
      total += value;
    });
    return Math.round(total / count);
  }
  /**
   * Assemble the report entry for a benchmark that ran.
   *
   * @param {string} name - Benchmark identifier.
   * @param {boolean} comparable - Flag copied verbatim into the entry.
   * @param {Array<{elapsed_ms: number}>} samples - One record per run.
   * @returns {object} Entry with `name`, `comparable`, `available: true`,
   *   `runs` (the sample count), `mean_ms` (via `mean()` over each sample's
   *   `elapsed_ms`) and the `samples` array itself.
   */
  function makeSummary(name, comparable, samples) {
    const durations = samples.map(({ elapsed_ms }) => elapsed_ms);
    const summary = {
      name,
      comparable,
      available: true,
      runs: samples.length,
    };
    summary.mean_ms = mean(durations);
    summary.samples = samples;
    return summary;
  }
  /**
   * Invoke `runner` once per configured run, one after another, timing each call.
   *
   * The iteration count comes from the module-level `runs` value. Every call is
   * awaited before the next one starts, and any error thrown by `runner`
   * propagates to the caller.
   *
   * @param {string} name - Benchmark identifier for the report.
   * @param {boolean} comparable - Flag forwarded to `makeSummary()`.
   * @param {Function} runner - Sync or async function returning the raw output.
   * @returns {Promise<object>} The entry produced by `makeSummary()`.
   */
  async function benchmark(name, comparable, runner) {
    const collected = [];
    let index = 0;
    while (index < runs) {
      const t0 = now();
      const output = await runner();
      const elapsed = now() - t0;
      collected.push({
        run: index + 1,
        elapsed_ms: elapsed,
        preview: summarize(output),
      });
      index += 1;
    }
    return makeSummary(name, comparable, collected);
  }
  /**
   * Run the `claude` CLI directly, feeding the prompt on stdin.
   *
   * The CLI is started with `--print --no-session-persistence --tools ''` and
   * the configured model, and is bounded by the configured timeout.
   *
   * @returns {string} Everything the CLI wrote to stdout.
   * @throws {Error} When the process cannot be spawned or is cut off by the
   *   timeout (original error attached as `cause`), when it is terminated by a
   *   signal, or when it exits with a non-zero status (message is the trimmed
   *   stderr if any).
   */
  function runClaudeRaw() {
    const result = spawnSync('claude', [
      '--print',
      '--no-session-persistence',
      '--tools',
      '',
      '--model',
      claudeModel,
    ], {
      input: prompt,
      encoding: 'utf8',
      timeout: timeoutMs,
    });
    // spawnSync reports spawn failures and timeouts through `error`, leaving
    // `status` null; surface that instead of "exited with code null".
    if (result.error) {
      throw new Error(`claude failed to run: ${result.error.message}`, { cause: result.error });
    }
    if (result.status !== 0) {
      const fallback = result.signal
        ? `claude terminated by signal ${result.signal}`
        : `claude exited with code ${result.status}`;
      throw new Error(result.stderr?.trim() || fallback);
    }
    return result.stdout;
  }
  /**
   * Send the prompt through a freshly constructed `CLIProvider`.
   *
   * The provider is created with the configured model and the current working
   * directory, then asked for a single user-message completion.
   *
   * @returns {Promise<string>} The `text` of the first content entry in the
   *   response, or `''` when that is missing or falsy.
   */
  async function runVLCodeProvider() {
    const providerOptions = {
      model: claudeModel,
      workDir: process.cwd(),
    };
    const provider = new CLIProvider(providerOptions);

    const request = {
      messages: [{ role: 'user', content: prompt }],
    };
    const response = await provider.messages.create(request);

    const firstEntry = response.content?.[0];
    return firstEntry?.text || '';
  }
  /**
   * Send the prompt through a minimal two-step workflow run by `WorkflowExecutor`.
   *
   * The workflow has one `LLM_Benchmark` step that passes the `prompt` param as
   * a user message and maps `_result` to `$answer`, followed by a `Stop_Done`
   * step. Any message delivered to the `onError` hook is rethrown as an Error.
   *
   * @returns {Promise<string>} The `$answer` variable read from the executor's
   *   `_ctx.variables` after execution, or `''` when it is missing or falsy.
   */
  async function runWorkflowExecutor() {
    const registry = {
      params: ['prompt(STRING)'],
      services: [],
      apis: [],
      components: [],
      vars: ['$answer(STRING)'],
      files: { inputs: [], artifacts: [] },
    };
    const llmStep = {
      id: 'LLM_Benchmark',
      meta: { title: 'Benchmark LLM Call' },
      in: {
        messages: [{ role: 'user', content: '=prompt' }],
      },
      out: { '$answer': '=_result' },
      next: 'Stop_Done',
    };
    const stopStep = {
      id: 'Stop_Done',
      meta: { title: 'Done' },
    };
    const workflow = {
      version: '3.16',
      name: 'BenchmarkWorkflow',
      registry,
      steps: [llmStep, stopStep],
    };

    const failOnError = (message) => {
      throw new Error(message);
    };

    const executor = new WorkflowExecutor({
      model: claudeModel,
      workDir: process.cwd(),
    });
    await executor.execute(workflow, { prompt }, { onError: failOnError });

    const answer = executor._ctx?.variables?.$answer;
    return answer || '';
  }
  /**
   * Run `codex exec` once with a fixed "reply OK" instruction.
   *
   * The CLI is started with `--skip-git-repo-check`, a read-only sandbox and
   * the OS temp directory as its working directory. `--model` is added only
   * when a codex model is configured. The instruction text is fixed here and
   * does not use the `--prompt` value.
   *
   * @returns {string} Everything the CLI wrote to stdout.
   * @throws {Error} When the process cannot be spawned or is cut off by the
   *   timeout (original error attached as `cause`), when it is terminated by a
   *   signal, or when it exits with a non-zero status (message is the trimmed
   *   stderr if any).
   */
  function runCodexExec() {
    const cliArgs = [
      'exec',
      '--skip-git-repo-check',
      '--sandbox',
      'read-only',
      '--cd',
      os.tmpdir(),
    ];
    if (codexModel) {
      cliArgs.push('--model', codexModel);
    }
    cliArgs.push('Reply with exactly OK. Do not run commands. Do not inspect files.');
    const result = spawnSync('codex', cliArgs, {
      encoding: 'utf8',
      timeout: timeoutMs,
    });
    // spawnSync reports spawn failures and timeouts through `error`, leaving
    // `status` null; surface that instead of "exited with code null".
    if (result.error) {
      throw new Error(`codex failed to run: ${result.error.message}`, { cause: result.error });
    }
    if (result.status !== 0) {
      const fallback = result.signal
        ? `codex terminated by signal ${result.signal}`
        : `codex exited with code ${result.status}`;
      throw new Error(result.stderr?.trim() || fallback);
    }
    return result.stdout;
  }
  /**
   * Run every available benchmark and print one JSON report to stdout.
   *
   * - If the `claude` CLI is found, the raw CLI, the CLI provider and the
   *   workflow executor are benchmarked in that order; otherwise a single
   *   `claude_raw` "not installed" entry is recorded.
   * - If the `codex` CLI is found it is benchmarked; otherwise a "not
   *   installed" entry is recorded.
   * - `gemini` is only probed for presence; no runner exists for it.
   *
   * A runner that throws is recorded as `{ available: false, reason }` and the
   * remaining benchmarks still run, so results already collected are never
   * lost. A failing Claude-backed benchmark is additionally logged to stderr
   * and makes the process exit with a non-zero code after the report has been
   * printed.
   *
   * @returns {Promise<void>}
   */
  async function main() {
    const output = {
      generated_at: new Date().toISOString(),
      cwd: process.cwd(),
      runs,
      prompt,
      benchmarks: [],
    };

    // Run one benchmark, turning a thrown error into a report entry.
    // `required` marks benchmarks whose failure must still fail the process.
    const runOrReport = async (name, comparable, runner, required) => {
      try {
        return await benchmark(name, comparable, runner);
      } catch (error) {
        if (required) {
          console.error(error);
          process.exitCode = 1;
        }
        return {
          name,
          available: false,
          comparable,
          reason: error?.message ?? String(error),
        };
      }
    };

    if (commandExists('claude')) {
      const claudeBacked = [
        ['claude_raw', runClaudeRaw],
        ['vlcode_cli_provider', runVLCodeProvider],
        ['vlcode_workflow_executor', runWorkflowExecutor],
      ];
      // Sequential on purpose: concurrent runs would distort the timings.
      for (const [name, runner] of claudeBacked) {
        output.benchmarks.push(await runOrReport(name, true, runner, true));
      }
    } else {
      output.benchmarks.push({ name: 'claude_raw', available: false, comparable: true, reason: 'claude CLI not installed' });
    }

    if (commandExists('codex')) {
      output.benchmarks.push(await runOrReport('codex_exec', false, runCodexExec, false));
    } else {
      output.benchmarks.push({ name: 'codex_exec', available: false, comparable: false, reason: 'codex CLI not installed' });
    }

    if (commandExists('gemini')) {
      output.benchmarks.push({ name: 'gemini_cli', available: true, comparable: false, reason: 'installed but no benchmark runner implemented yet' });
    } else {
      output.benchmarks.push({ name: 'gemini_cli', available: false, comparable: false, reason: 'gemini CLI not installed' });
    }

    console.log(JSON.stringify(output, null, 2));
  }
  // Script entry point: start the benchmark run; if main() rejects, log the
  // error to stderr and terminate with exit code 1.
  const reportAndExit = (failure) => {
    console.error(failure);
    process.exit(1);
  };
  main().catch(reportAndExit);