compare-llm-codegen.js 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012
  1. #!/usr/bin/env node
  2. import fs from 'fs/promises';
  3. import fsSync from 'fs';
  4. import os from 'os';
  5. import path from 'path';
  6. import { spawn, execSync } from 'child_process';
  7. import { performance } from 'perf_hooks';
  8. import { PARSEVL_URL } from '../src/data/versions.js';
  9. import { getCookie } from '../src/server/helpers.js';
  10. import { WorkflowExecutor } from '../src/vl/workflow-executor.js';
  11. import { VLProjectContext } from '../src/vl/project-context.js';
  12. import { createVLValidateTool } from '../src/tools/vl-validate.js';
  13. import { extractFromFileTree, validateMeta } from '../src/vl/metadata-extractor.js';
  // ---- Benchmark configuration -------------------------------------------

  // Model name handed to the claude CLI and to the workflow executor.
  // VL_CODE_MODEL overrides the default when it is set to a non-empty value.
  const MODEL = process.env.VL_CODE_MODEL || 'claude-opus-4-6';

  // Language code passed to the workflow run as `targetLang`.
  const TARGET_LANG = 'zh-CN';

  // Date portion (YYYY-MM-DD) of the current UTC ISO timestamp.
  const TODAY = new Date().toISOString().slice(0, 10);

  // Directory under the user's home where benchmark projects are created.
  const TEST_ROOT = path.join(os.homedir(), 'Documents', 'VLProjects', '_tests');

  // Locations resolved against the current working directory.
  const REPORT_DIR = path.resolve('docs', 'benchmarks');
  const WORKFLOW_PATH = path.resolve('.vl-code', 'workflows', '6-file-codegen.json');

  // Base URL used when building DocCenter service requests.
  const DOCCENTER_BASE = 'https://v4pre.visuallogic.ai/api/12027022';
  /**
   * Project-relative paths the direct baseline is expected to produce.
   * runDirectBaseline compares the generated file list against this list to
   * report missing and extra paths.
   */
  const DIRECT_REQUIRED_PATHS = [
    // Database and theme
    'Database/CampusOps.vdb',
    'Theme/CampusOpsTheme.vth',

    // Service domains
    'Services/OperationsOverview.vs',
    'Services/ScheduleService.vs',
    'Services/WorkOrderService.vs',
    'Services/AlertService.vs',
    'Services/SettingsService.vs',

    // Reusable components
    'ExtComponents/KpiCard.cp',
    'ExtComponents/StatusPill.cp',
    'ExtComponents/FilterToolbar.cp',
    'ExtComponents/AlertListItem.cp',

    // Sections
    'Sections/OverviewPage.sc',
    'Sections/ScheduleBoard.sc',
    'Sections/WorkOrderDesk.sc',
    'Sections/AlertCenter.sc',
    'Sections/SettingsPage.sc',

    // App entry
    'Apps/CampusOpsApp.vx',
  ];
  /**
   * Fixed project contract for the direct-generation baseline.
   *
   * The whole object is serialized into every direct-generation prompt, and
   * its `services`, `components`, `sections` and `app` entries double as the
   * per-file generation targets (each carries the `filePath` to write).
   * Key order is significant because the object is emitted via JSON.stringify.
   */
  const DIRECT_CONTRACT = {
    projectName: 'CampusOps',

    // Theme file plus the palette the generated theme should reflect.
    theme: {
      filePath: 'Theme/CampusOpsTheme.vth',
      style: 'enterprise-light',
      palette: {
        primary: '#0F7B6C',
        primaryHover: '#136E62',
        warning: '#F59E0B',
        danger: '#DC2626',
        success: '#16A34A',
        slate: '#334155',
        surface: '#F8FAFC',
        surfaceElevated: '#FFFFFF',
      },
      direction: 'deep teal primary, slate neutrals, soft elevated cards, pill filters, enterprise operations dashboard',
    },

    // Database file and the tables/fields it must declare.
    dataModel: {
      filePath: 'Database/CampusOps.vdb',
      tables: [
        {
          id: 'Campus',
          fields: [
            { name: 'name', type: 'STRING' },
            { name: 'region', type: 'STRING' },
            { name: 'manager', type: 'STRING' },
            { name: 'activeAlerts', type: 'INT' },
          ],
        },
        {
          id: 'Technician',
          fields: [
            { name: 'name', type: 'STRING' },
            { name: 'campusId', type: 'INT' },
            { name: 'skillTag', type: 'STRING' },
            { name: 'shiftStatus', type: 'STRING' },
            { name: 'utilizationRate', type: 'FLOAT' },
          ],
        },
        {
          id: 'WorkOrder',
          fields: [
            { name: 'campusId', type: 'INT' },
            { name: 'title', type: 'STRING' },
            { name: 'priority', type: 'STRING' },
            { name: 'status', type: 'STRING' },
            { name: 'assigneeId', type: 'INT' },
            { name: 'slaHours', type: 'INT' },
          ],
        },
        {
          id: 'AlertRule',
          fields: [
            { name: 'campusId', type: 'INT' },
            { name: 'ruleName', type: 'STRING' },
            { name: 'thresholdValue', type: 'FLOAT' },
            { name: 'enabled', type: 'BOOL' },
          ],
        },
        {
          id: 'AlertEvent',
          fields: [
            { name: 'campusId', type: 'INT' },
            { name: 'ruleId', type: 'INT' },
            { name: 'severity', type: 'STRING' },
            { name: 'status', type: 'STRING' },
            { name: 'message', type: 'STRING' },
          ],
        },
        {
          id: 'UserPreference',
          fields: [
            { name: 'density', type: 'STRING' },
            { name: 'defaultCampusId', type: 'INT' },
            { name: 'emailDigest', type: 'BOOL' },
          ],
        },
      ],
    },

    // One entry per service-domain file, with its method signatures.
    services: [
      {
        domainId: 'OperationsOverview',
        filePath: 'Services/OperationsOverview.vs',
        purpose: 'dashboard KPIs and summary cards',
        methods: [
          { id: 'GetOverviewMetrics', params: 'campusId(INT)', returns: '{success:BOOL,data:OBJECT}' },
        ],
      },
      {
        domainId: 'ScheduleService',
        filePath: 'Services/ScheduleService.vs',
        purpose: 'technician assignment list and shift filters',
        methods: [
          { id: 'ListAssignments', params: 'campusId(INT),shiftStatus(STRING)', returns: '{success:BOOL,data:[{}]}' },
        ],
      },
      {
        domainId: 'WorkOrderService',
        filePath: 'Services/WorkOrderService.vs',
        purpose: 'work order list and status updates',
        methods: [
          { id: 'ListWorkOrders', params: 'campusId(INT),priority(STRING),status(STRING)', returns: '{success:BOOL,data:[{}]}' },
          { id: 'UpdateWorkOrderStatus', params: 'workOrderId(INT),status(STRING)', returns: '{success:BOOL}' },
        ],
      },
      {
        domainId: 'AlertService',
        filePath: 'Services/AlertService.vs',
        purpose: 'alert event list and acknowledgement',
        methods: [
          { id: 'ListAlerts', params: 'campusId(INT),severity(STRING),status(STRING)', returns: '{success:BOOL,data:[{}]}' },
          { id: 'AcknowledgeAlert', params: 'alertId(INT)', returns: '{success:BOOL}' },
        ],
      },
      {
        domainId: 'SettingsService',
        filePath: 'Services/SettingsService.vs',
        purpose: 'settings and threshold preferences',
        methods: [
          { id: 'GetSettings', params: 'campusId(INT)', returns: '{success:BOOL,data:OBJECT}' },
          { id: 'SaveSettings', params: 'campusId(INT),density(STRING),threshold(FLOAT)', returns: '{success:BOOL}' },
        ],
      },
    ],

    // One entry per reusable component file.
    components: [
      {
        id: 'KpiCard',
        filePath: 'ExtComponents/KpiCard.cp',
        purpose: 'title, numeric value, helper text, optional intent tone',
      },
      {
        id: 'StatusPill',
        filePath: 'ExtComponents/StatusPill.cp',
        purpose: 'compact status chip for priority or lifecycle states',
      },
      {
        id: 'FilterToolbar',
        filePath: 'ExtComponents/FilterToolbar.cp',
        purpose: 'filter row with campus and status selectors plus clear action',
      },
      {
        id: 'AlertListItem',
        filePath: 'ExtComponents/AlertListItem.cp',
        purpose: 'alert row with severity, message, meta, and acknowledge button',
      },
    ],

    // One entry per section file, naming the services and components it uses.
    sections: [
      {
        id: 'OverviewPage',
        filePath: 'Sections/OverviewPage.sc',
        consumesServices: ['OperationsOverview.GetOverviewMetrics'],
        usesComponents: ['KpiCard'],
        purpose: 'overview dashboard with KPI cards and campus summary rows',
      },
      {
        id: 'ScheduleBoard',
        filePath: 'Sections/ScheduleBoard.sc',
        consumesServices: ['ScheduleService.ListAssignments'],
        usesComponents: ['FilterToolbar', 'StatusPill'],
        purpose: 'schedule and technician assignment table',
      },
      {
        id: 'WorkOrderDesk',
        filePath: 'Sections/WorkOrderDesk.sc',
        consumesServices: ['WorkOrderService.ListWorkOrders', 'WorkOrderService.UpdateWorkOrderStatus'],
        usesComponents: ['FilterToolbar', 'StatusPill'],
        purpose: 'work order list with status change action',
      },
      {
        id: 'AlertCenter',
        filePath: 'Sections/AlertCenter.sc',
        consumesServices: ['AlertService.ListAlerts', 'AlertService.AcknowledgeAlert'],
        usesComponents: ['FilterToolbar', 'AlertListItem'],
        purpose: 'alert center with severity list and acknowledgement actions',
      },
      {
        id: 'SettingsPage',
        filePath: 'Sections/SettingsPage.sc',
        consumesServices: ['SettingsService.GetSettings', 'SettingsService.SaveSettings'],
        usesComponents: ['StatusPill'],
        purpose: 'alert threshold and density settings form',
      },
    ],

    // App file and its route-to-section mapping.
    app: {
      id: 'CampusOpsApp',
      filePath: 'Apps/CampusOpsApp.vx',
      routes: [
        { path: 'overview', sectionId: 'OverviewPage' },
        { path: 'schedule', sectionId: 'ScheduleBoard' },
        { path: 'work-orders', sectionId: 'WorkOrderDesk' },
        { path: 'alerts', sectionId: 'AlertCenter' },
        { path: 'settings', sectionId: 'SettingsPage' },
      ],
    },
  };
  /**
   * Natural-language requirement for the CampusOps app. It is embedded in each
   * direct-generation prompt and passed to the workflow run as `userRequest`.
   * The text is part of the benchmark input and must stay stable.
   */
  const APP_REQUIREMENT = `
Build a desktop-first operations cockpit called CampusOps for a multi-campus facilities team.
Business scope:
- Roles: dispatcher and supervisor.
- Routes/pages: /overview, /schedule, /work-orders, /alerts, /settings.
- Domain data: Campus, Technician, WorkOrder, AlertRule, AlertEvent.
- Key interactions:
- Overview shows KPI cards for open work orders, overdue SLA, active alerts, and technician utilization.
- Schedule page lists technician assignments by campus and lets users filter by campus, technician, and shift status.
- Work order page lists work orders with filters by campus, priority, and status, and supports changing the order status.
- Alert center lists alert events with severity, source campus, and acknowledgement/escalation actions.
- Settings page edits alert thresholds and dashboard density preferences.
Design direction:
- Theme should feel enterprise and operational: deep teal primary, slate surfaces, amber warning, red danger, soft elevated cards, pill filters.
- Prefer card/list/table layouts instead of advanced chart widgets.
- Keep interactions compile-safe and easy to preview.
Implementation constraints:
- Use the exact file names listed below when generating the direct baseline.
- Generate realistic mock data in the database file.
- Use only VL-safe constructs; avoid speculative widgets or syntax.
`.trim();
  /**
   * Condensed VL and Theme rules placed at the top of every direct-generation
   * prompt. The text is sent to the model verbatim, so it is prompt content,
   * not documentation of this script.
   */
  const DIRECT_VL_DIGEST = `
Latest reference baseline:
- Latest DocCenter VL syntax document reports version 3.6.
- Latest Theme doc is Theme-Enterprise-6.5 with styleSpaceVersion 1.6.
- Current VLCode-Lite runtime, validator, and workflow toolchain in this repo are still pinned to VL 3.5.
Compatibility target for this benchmark:
- Every generated VL file must start with // VL_VERSION:3.5.
- Stay within the shared 3.5-safe subset even when the latest syntax doc is newer.
Essential VL rules distilled from the latest docs:
- File types: .vx App, .sc Section, .cp Component, .vs ServiceDomain, .vdb Database, .vth Theme.
- Cross references: App -> Section/Component only; Section -> ServiceDomain/Component only; Service and Component do not cross-reference others.
- Indentation uses leading hyphens, never spaces.
- App required section order: SysConfig, Frontend Global Vars, Frontend Derived Vars, Frontend Tree, Frontend Event Handlers, Frontend Internal Methods, Frontend Pipeline Funcs.
- Section required section order: Frontend Public Props, Frontend Public Events, Frontend Public Methods, Frontend Global Vars, Frontend Derived Vars, Frontend Tree, Frontend Event Handlers, Frontend Internal Methods, Frontend Pipeline Funcs.
- Component required section order: Frontend Public Props, Frontend Public Events, Frontend Derived Vars, Frontend Tree, Frontend Event Handlers, Frontend Internal Methods, Frontend Pipeline Funcs.
- ServiceDomain required section order: Backend Environment Vars, Backend Tree, Services, Backend Event Handlers, Transactions, Backend Internal Methods, Backend Pipeline Funcs.
- Style values must be static string literals only. Do not use ternary expressions or variable expressions inside style:.
- Do not emit CSS-only skin props such as border-collapse.
- Do not bind events directly on <For-*> nodes; bind on the interactive child inside the loop.
- Prefer simple cards, rows, columns, text, buttons, input/select-like controls, if/for blocks, and service calls.
Theme rules distilled from the latest docs:
- Theme file order: # Meta -> # Design Tokens (optional) -> # Point Slot Values -> # Overrides (optional).
- The heading must be exactly # Point Slot Values. Do not use legacy # Coordinate Values.
- base_theme should stay Platform/Theme-Default-Light@1 for this benchmark.
- Use enterprise-style point slots such as intent.*, emphasis.*, shape.*, surface.*, textRole.*, state.*, size.*, space.* when overriding appearance.
`.trim();
  /**
   * Turn a cookie value into a `Cookie` header string.
   *
   * @param {*} cookie - Raw token or an already prefixed cookie string.
   * @returns {string} '' for a falsy input, the input unchanged when it
   *   already starts with `ih5bearer=`, otherwise the input with that prefix.
   */
  function normalizeCookie(cookie) {
    if (!cookie) {
      return '';
    }
    const text = String(cookie);
    if (text.startsWith('ih5bearer=')) {
      return text;
    }
    return `ih5bearer=${cookie}`;
  }
  /**
   * Remove every hyphen from a date-like string and keep the first eight
   * characters (e.g. '2024-05-06' -> '20240506').
   *
   * @param {*} dateText - Date text; falsy values are treated as ''.
   * @returns {string} At most eight characters of the hyphen-free text.
   */
  function slugDate(dateText) {
    const source = String(dateText || '');
    const withoutHyphens = source.split('-').join('');
    return withoutHyphens.slice(0, 8);
  }
  /**
   * Pick a project directory name under TEST_ROOT that does not exist yet.
   *
   * Returns `baseName` when it is free. Otherwise a run counter starting at 2
   * is added until a free name is found: names ending in `Test` become
   * `...Run<n>Test`, and any other name becomes `<baseName>Run<n>`.
   *
   * The second form fixes a hang: without a `Test` suffix the old replace()
   * left the candidate unchanged, so the loop never ended once the directory
   * existed.
   *
   * @param {string} baseName - Preferred project directory name.
   * @returns {string} A name with no existing entry under TEST_ROOT.
   */
  function projectNameWithFallback(baseName) {
    const suffixPattern = /Test$/;
    const hasTestSuffix = suffixPattern.test(baseName);
    let candidate = baseName;
    for (let n = 2; fsSync.existsSync(path.join(TEST_ROOT, candidate)); n += 1) {
      candidate = hasTestSuffix
        ? baseName.replace(suffixPattern, `Run${n}Test`)
        : `${baseName}Run${n}`;
    }
    return candidate;
  }
  /**
   * Create the standard project sub-directories (and any missing parents)
   * under `projectDir`. Directories that already exist are left alone.
   *
   * @param {string} projectDir - Project root directory.
   * @returns {Promise<void>}
   */
  async function ensureProjectScaffold(projectDir) {
    const folders = [
      'Apps',
      'Sections',
      'ExtComponents',
      'Services',
      'Database',
      'Theme',
      'Process',
      '.vl-code',
    ];
    // Created one after another, in list order.
    for (const folder of folders) {
      const target = path.join(projectDir, folder);
      await fs.mkdir(target, { recursive: true });
    }
  }
  /**
   * POST `{ docId }` to the DocCenter `SERVICE_DocCenter_GetDocById` endpoint
   * and return the document body.
   *
   * @param {*} docId - Document identifier, sent as-is in the JSON body.
   * @param {*} cookie - Cookie value; normalized via normalizeCookie().
   * @returns {Promise<string>} `data.currentContent` of the JSON response, or
   *   '' when that field is missing or falsy. The HTTP status is not checked;
   *   a response body that is not JSON makes the call reject.
   */
  async function fetchDoc(docId, cookie) {
    const endpoint = `${DOCCENTER_BASE}/SERVICE_DocCenter_GetDocById`;
    const headers = {
      'Content-Type': 'application/json',
      'Cookie': normalizeCookie(cookie),
    };
    const response = await fetch(endpoint, {
      method: 'POST',
      headers,
      body: JSON.stringify({ docId }),
    });
    const payload = await response.json();
    const content = payload?.data?.currentContent;
    return content || '';
  }
  /**
   * Run the `claude` CLI once in print mode, feed `prompt` on stdin and return
   * the trimmed stdout.
   *
   * Changes from the previous version:
   * - stdout/stderr are decoded with setEncoding('utf8'), so a multi-byte
   *   character split across two chunks is no longer corrupted;
   * - stdin has an 'error' listener, so an EPIPE from a child that failed to
   *   start or exited early cannot surface as an unhandled stream error;
   * - the timeout path marks the promise as settled, like the other exits.
   *
   * @param {string} prompt - Text written to the CLI's stdin.
   * @param {object} [options]
   * @param {string} [options.systemPrompt=''] - Added as `--system-prompt` when non-empty.
   * @param {string} [options.model=MODEL] - Value for `--model`.
   * @param {number} [options.timeoutMs=1200000] - Time before the child gets SIGTERM.
   * @returns {Promise<string>} Trimmed stdout on exit code 0.
   * @throws {Error} Rejects on spawn failure, a non-zero exit code (message
   *   carries up to 600 characters of stderr), or timeout.
   */
  async function runClaudePrompt(prompt, { systemPrompt = '', model = MODEL, timeoutMs = 20 * 60 * 1000 } = {}) {
    return await new Promise((resolve, reject) => {
      const args = ['--print', '--no-session-persistence', '--model', model, '--tools', ''];
      if (systemPrompt) args.push('--system-prompt', systemPrompt);

      const env = { ...process.env, NO_PROXY: 'localhost,127.0.0.1,::1' };
      delete env.CLAUDECODE;

      const proc = spawn('claude', args, {
        stdio: ['pipe', 'pipe', 'pipe'],
        env,
      });

      let stdout = '';
      let stderr = '';
      let finished = false;
      let timer = null;

      // Single exit point: only the first outcome counts, and it stops the timer.
      const settle = (complete, value) => {
        if (finished) return;
        finished = true;
        clearTimeout(timer);
        complete(value);
      };

      timer = setTimeout(() => {
        proc.kill('SIGTERM');
        settle(reject, new Error(`claude prompt timed out after ${Math.round(timeoutMs / 1000)}s`));
      }, timeoutMs);

      // Let the streams decode UTF-8 so characters spanning chunks stay intact.
      proc.stdout.setEncoding('utf8');
      proc.stderr.setEncoding('utf8');
      proc.stdout.on('data', (chunk) => {
        stdout += chunk;
      });
      proc.stderr.on('data', (chunk) => {
        stderr += chunk;
      });

      proc.on('error', (err) => settle(reject, err));
      proc.on('close', (code) => {
        if (code !== 0) {
          settle(reject, new Error(`claude exited with code ${code}: ${stderr.slice(0, 600)}`));
          return;
        }
        settle(resolve, stdout.trim());
      });

      proc.stdin.on('error', (err) => {
        // EPIPE means the child is already gone; its 'error'/'close' event
        // reports the real outcome, so do not mask it here.
        if (err?.code === 'EPIPE') return;
        settle(reject, err);
      });
      proc.stdin.write(prompt);
      proc.stdin.end();
    });
  }
  /**
   * Parse a JSON value out of a model response.
   *
   * Candidates are tried in this order:
   *   1. the content of the first ``` / ```json fence, if there is one;
   *   2. the whole trimmed response.
   * Each candidate is parsed as-is first, then as the span from its first `{`
   * to its last `}`.
   *
   * Step 2 is new for responses that contain a fence: previously an unrelated
   * fenced snippet made extraction fail even when valid JSON sat elsewhere in
   * the text. Inputs that parsed before still yield the same value.
   *
   * @param {*} text - Raw response text.
   * @returns {*} The parsed JSON value.
   * @throws {Error} 'empty response' for blank input; 'could not extract JSON
   *   from response' (with the last parse error as `cause`) otherwise.
   */
  function extractJson(text) {
    const trimmed = String(text || '').trim();
    if (!trimmed) throw new Error('empty response');

    const fence = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/i);
    const candidates = fence ? [fence[1].trim(), trimmed] : [trimmed];

    let lastError;
    for (const candidate of candidates) {
      try {
        return JSON.parse(candidate);
      } catch (err) {
        lastError = err;
      }
      const firstBrace = candidate.indexOf('{');
      const lastBrace = candidate.lastIndexOf('}');
      if (firstBrace >= 0 && lastBrace > firstBrace) {
        try {
          return JSON.parse(candidate.slice(firstBrace, lastBrace + 1));
        } catch (err) {
          lastError = err;
        }
      }
    }
    throw new Error('could not extract JSON from response', { cause: lastError });
  }
  /**
   * Write every entry of `filesMap` (relative path -> content) under
   * `projectDir`, creating parent directories as needed.
   *
   * A relative path that resolves outside `projectDir` (for example via `..`
   * segments) is now rejected before anything is written for that entry, so a
   * malformed or hostile file map cannot touch files outside the project.
   *
   * @param {string} projectDir - Root directory that receives the files.
   * @param {Object<string, *>} [filesMap] - Contents are stringified; a nullish
   *   map writes nothing.
   * @returns {Promise<string[]>} The written relative paths, sorted.
   * @throws {Error} When a path escapes `projectDir`, or on a file-system error.
   */
  async function writeObjectFiles(projectDir, filesMap) {
    const root = path.resolve(projectDir);
    const written = [];
    for (const [relPath, content] of Object.entries(filesMap || {})) {
      const target = path.join(projectDir, relPath);
      const fromRoot = path.relative(root, path.resolve(target));
      const escapesRoot =
        fromRoot === '..' || fromRoot.startsWith(`..${path.sep}`) || path.isAbsolute(fromRoot);
      if (escapesRoot) {
        throw new Error(`refusing to write outside project directory: ${relPath}`);
      }
      await fs.mkdir(path.dirname(target), { recursive: true });
      await fs.writeFile(target, String(content), 'utf-8');
      written.push(relPath);
    }
    return written.sort();
  }
  /**
   * Map `items` through an async `iterator` with at most `limit` calls in
   * flight, returning results in input order.
   *
   * A limit that is not a number (undefined, NaN, non-numeric string) now
   * falls back to one worker. Previously it produced zero workers, so the
   * iterator never ran and an unfilled array came back without any error.
   * Limits below 1 keep running with a single worker, as before.
   *
   * @param {Array} items - Input values.
   * @param {number} limit - Maximum number of concurrent iterator calls.
   * @param {function(*, number): *} iterator - Called as (item, index); may return a promise.
   * @returns {Promise<Array>} Results positioned by input index.
   * @throws Rejects with the first iterator failure.
   */
  async function mapLimit(items, limit, iterator) {
    const results = new Array(items.length);
    const numericLimit = Number(limit);
    const requested = Number.isNaN(numericLimit) ? 1 : numericLimit;
    const workerCount = Math.max(1, Math.min(requested, items.length));

    // Workers share one cursor; each claims the next index before awaiting,
    // so no item is processed twice.
    let nextIndex = 0;
    const worker = async () => {
      while (nextIndex < items.length) {
        const current = nextIndex;
        nextIndex += 1;
        results[current] = await iterator(items[current], current);
      }
    };

    await Promise.all(Array.from({ length: workerCount }, () => worker()));
    return results;
  }
  /**
   * Build the prompt for one direct-generation target.
   *
   * Every prompt starts with the same preamble (VL digest, full project
   * contract as JSON, app requirement, output rules) followed by
   * kind-specific instructions.
   *
   * @param {object} args
   * @param {string} args.kind - 'database' | 'theme' | 'service' | 'component' | 'section' | 'app'.
   * @param {object} args.target - Generation target; must carry `filePath`.
   *   For service/component/section/app the whole object is embedded as JSON.
   * @param {*} [args.latestThemeDoc] - Theme reference text; only used for 'theme'.
   * @returns {string} The complete prompt.
   * @throws {Error} When `kind` is not one of the supported values.
   */
  function buildDirectFilePrompt({ kind, target, latestThemeDoc }) {
    const contractJson = JSON.stringify(DIRECT_CONTRACT, null, 2);
    const base = `
You are generating one VL file for the CampusOps benchmark.
${DIRECT_VL_DIGEST}
Project contract:
<project-contract>
${contractJson}
</project-contract>
Global requirement:
${APP_REQUIREMENT}
Common output rules:
- Output only raw VL source code for the requested file.
- Do not use markdown fences.
- The first line must be // VL_VERSION:3.5
- Keep identifiers and file references exactly aligned with the project contract.
- Use compile-safe VL only.
`.trim();

    // Pretty-printed JSON, always coerced to a string like template interpolation does.
    const pretty = (value) => `${JSON.stringify(value, null, 2)}`;

    switch (kind) {
      case 'database':
        return [
          base,
          `Target file: ${target.filePath}`,
          'Generate the .vdb file with realistic seed data for the declared tables. Keep relations simple and consistent with the contract.',
        ].join('\n');

      case 'theme':
        return [
          base,
          `Target file: ${target.filePath}`,
          'Latest Theme 6.5 reference:',
          '<theme-doc>',
          `${latestThemeDoc}`,
          '</theme-doc>',
          'Generate a custom enterprise-light theme for CampusOps. It must keep base_theme:"Platform/Theme-Default-Light@1", use # Point Slot Values, and visibly reflect the contract palette.',
        ].join('\n');

      case 'service':
        return [
          base,
          `Target file: ${target.filePath}`,
          'Service contract:',
          pretty(target),
          'Generate one ServiceDomain file with virtual tables or direct table access as needed. Keep query filters safe: skip optional filters when empty, 0, or -1.',
        ].join('\n');

      case 'component':
        return [
          base,
          `Target file: ${target.filePath}`,
          'Component contract:',
          pretty(target),
          'Generate one reusable pure UI component.',
        ].join('\n');

      case 'section':
        return [
          base,
          `Target file: ${target.filePath}`,
          'Section contract:',
          pretty(target),
          'Available services:',
          pretty(DIRECT_CONTRACT.services),
          'Available components:',
          pretty(DIRECT_CONTRACT.components),
          'Generate one Section file with local state, service calls, and simple event handlers.',
        ].join('\n');

      case 'app':
        return [
          base,
          `Target file: ${target.filePath}`,
          'App contract:',
          pretty(target),
          'Available sections:',
          pretty(DIRECT_CONTRACT.sections),
          'Generate one App file that routes to the declared sections and uses a clear sidebar + content layout.',
        ].join('\n');

      default:
        throw new Error(`Unknown direct generation kind: ${kind}`);
    }
  }
  /**
   * Generate one project file with a single claude CLI call.
   *
   * Side effects, in order:
   *   1. writes the prompt to `Process/DirectGeneration/<name>.prompt.txt`;
   *   2. runs the prompt (8 minute timeout);
   *   3. writes the raw reply to `Process/DirectGeneration/<name>.raw.txt`;
   *   4. writes the trimmed reply plus a trailing newline to `target.filePath`.
   *
   * @param {object} args
   * @param {string} args.projectDir - Project root directory.
   * @param {string} args.kind - Generation kind, see buildDirectFilePrompt().
   * @param {object} args.target - Target descriptor with `filePath`.
   * @param {*} [args.latestThemeDoc] - Theme reference text for theme prompts.
   * @returns {Promise<{filePath: string, kind: string, durationMs: number}>}
   *   `durationMs` covers the CLI call only.
   */
  async function generateSingleDirectFile({ projectDir, kind, target, latestThemeDoc }) {
    const { filePath } = target;
    const prompt = buildDirectFilePrompt({ kind, target, latestThemeDoc });

    // Artifact names use the file's base name with unusual characters replaced.
    const safeName = path.basename(filePath).replace(/[^\w.-]/g, '_');
    const promptDir = path.join(projectDir, 'Process', 'DirectGeneration');
    const artifactPath = (suffix) => path.join(promptDir, `${safeName}.${suffix}`);

    await fs.mkdir(promptDir, { recursive: true });
    await fs.writeFile(artifactPath('prompt.txt'), prompt, 'utf-8');

    const startedAt = performance.now();
    const raw = await runClaudePrompt(prompt, {
      systemPrompt: `Generate only the VL source for ${filePath}.`,
      timeoutMs: 8 * 60 * 1000,
    });
    const durationMs = Math.round(performance.now() - startedAt);

    await fs.writeFile(artifactPath('raw.txt'), raw, 'utf-8');

    const targetPath = path.join(projectDir, filePath);
    await fs.mkdir(path.dirname(targetPath), { recursive: true });
    await fs.writeFile(targetPath, `${raw.trim()}\n`, 'utf-8');

    return { filePath, kind, durationMs };
  }
  /**
   * Run the direct (one prompt per file) baseline for a project.
   *
   * Writes `Process/DirectContext.json`, then generates the files in four
   * phases, each with its own concurrency limit:
   *   1. database + theme (2), 2. services + components (4),
   *   3. sections (3), 4. app (1).
   *
   * @param {object} args
   * @param {string} args.projectDir - Project root directory.
   * @param {*} args.latestThemeDoc - Theme reference text forwarded to prompts.
   * @param {*} args.latestVlVersion - Recorded in the context file only.
   * @returns {Promise<object>} Total duration, `firstArtifactMs` (shortest
   *   phase-1 generation time, or null), missing/extra paths relative to
   *   DIRECT_REQUIRED_PATHS, the sorted written paths, the contract project
   *   name and the per-file stats in generation order.
   */
  async function runDirectBaseline({ projectDir, latestThemeDoc, latestVlVersion }) {
    await ensureProjectScaffold(projectDir);

    const contextSnapshot = {
      latestVlVersion,
      contract: DIRECT_CONTRACT,
      requiredPaths: DIRECT_REQUIRED_PATHS,
    };
    await fs.writeFile(
      path.join(projectDir, 'Process', 'DirectContext.json'),
      JSON.stringify(contextSnapshot, null, 2),
      'utf-8',
    );

    const planFor = (kind) => (target) => ({ kind, target });
    const phases = [
      {
        limit: 2,
        items: [
          planFor('database')({ filePath: DIRECT_CONTRACT.dataModel.filePath }),
          planFor('theme')({ filePath: DIRECT_CONTRACT.theme.filePath }),
        ],
      },
      {
        limit: 4,
        items: [
          ...DIRECT_CONTRACT.services.map(planFor('service')),
          ...DIRECT_CONTRACT.components.map(planFor('component')),
        ],
      },
      { limit: 3, items: DIRECT_CONTRACT.sections.map(planFor('section')) },
      { limit: 1, items: [planFor('app')(DIRECT_CONTRACT.app)] },
    ];
    const generate = (item) => generateSingleDirectFile({ projectDir, latestThemeDoc, ...item });

    // Phases run strictly one after another; only files within a phase overlap.
    const startedAt = performance.now();
    const phaseResults = [];
    for (const { items, limit } of phases) {
      phaseResults.push(await mapLimit(items, limit, generate));
    }
    const durationMs = Math.round(performance.now() - startedAt);

    const fileStats = phaseResults.flat();
    const [firstPhase] = phaseResults;
    const actualPaths = fileStats.map((stat) => stat.filePath).sort();
    const producedSet = new Set(actualPaths);
    const requiredSet = new Set(DIRECT_REQUIRED_PATHS);
    const missing = DIRECT_REQUIRED_PATHS.filter((relPath) => !producedSet.has(relPath));
    const extra = actualPaths.filter((relPath) => !requiredSet.has(relPath));

    return {
      durationMs,
      firstArtifactMs: firstPhase.length ? Math.min(...firstPhase.map((stat) => stat.durationMs)) : null,
      missingPaths: missing,
      extraPaths: extra.sort(),
      writtenPaths: actualPaths,
      declaredProjectName: DIRECT_CONTRACT.projectName,
      fileStats,
    };
  }
  /**
   * Run the workflow-based baseline for a project.
   *
   * Loads the workflow definition from WORKFLOW_PATH, executes it through
   * WorkflowExecutor with APP_REQUIREMENT as the user request, and records
   * node start/done/error events, text output and file-write events. After
   * the run, these are written to `Process/WorkflowTimeline.json`,
   * `Process/WorkflowLog.txt` and `Process/WorkflowFileEvents.json`.
   *
   * Change from the previous version: the value returned by
   * `executor.execute()` is no longer dropped. Whether execute() returns a
   * promise is not visible from this file; if it does and that promise
   * rejects without `onError` being called, the rejection now fails this
   * function instead of being left unhandled.
   *
   * @param {object} args
   * @param {string} args.projectDir - Project root directory.
   * @param {*} args.cookie - Forwarded to the executor.
   * @returns {Promise<{durationMs: number, timeline: object[], firstArtifactMs: (number|null)}>}
   *   `firstArtifactMs` is the earliest file-write offset, or null if no file
   *   was reported.
   * @throws {Error} When the workflow reports an error or execute() fails.
   */
  async function runWorkflowBaseline({ projectDir, cookie }) {
    await ensureProjectScaffold(projectDir);
    const workflow = JSON.parse(await fs.readFile(WORKFLOW_PATH, 'utf-8'));
    const executor = new WorkflowExecutor({
      workDir: projectDir,
      model: MODEL,
      llmProvider: 'cli',
      cookie,
    });

    const timeline = [];
    const logLines = [];
    const fileEvents = [];

    // Shared shape of a timeline entry; `extra` carries status-specific fields.
    const record = (status, info, extra = {}) => {
      timeline.push({
        nodeId: info.nodeId,
        title: info.title,
        type: info.type,
        status,
        at: new Date().toISOString(),
        ...extra,
      });
    };

    const startedAt = performance.now();
    await new Promise((resolve, reject) => {
      const pending = executor.execute(workflow, {
        userRequest: APP_REQUIREMENT,
        targetLang: TARGET_LANG,
      }, {
        onText: (text) => {
          const msg = String(text || '').trim();
          if (msg) logLines.push(msg);
        },
        onNodeStart: (info) => record('start', info),
        onNodeDone: (info) => record('done', info, { duration_ms: info.duration_ms || 0 }),
        onNodeError: (info) => record('error', info, { error: info.error || 'unknown error' }),
        onFileWritten: (filePath) => {
          fileEvents.push({
            filePath,
            atMs: Math.round(performance.now() - startedAt),
          });
        },
        onDone: (info) => resolve(info),
        onError: (message) => reject(new Error(message || 'workflow generation failed')),
      });
      // Harmless for a non-promise return value; a later reject() after the
      // promise has settled is a no-op.
      Promise.resolve(pending).catch(reject);
    });
    const durationMs = Math.round(performance.now() - startedAt);

    const processDir = path.join(projectDir, 'Process');
    await fs.writeFile(path.join(processDir, 'WorkflowTimeline.json'), JSON.stringify(timeline, null, 2), 'utf-8');
    await fs.writeFile(path.join(processDir, 'WorkflowLog.txt'), logLines.join('\n'), 'utf-8');
    await fs.writeFile(path.join(processDir, 'WorkflowFileEvents.json'), JSON.stringify(fileEvents, null, 2), 'utf-8');

    return {
      durationMs,
      timeline,
      firstArtifactMs: fileEvents.length ? Math.min(...fileEvents.map((item) => item.atMs)) : null,
    };
  }
  /**
   * Extract error and warning counts from the validator's text output.
   *
   * @param {*} validationText - Validator output; falsy values become ''.
   * @returns {{errors: (number|null), warnings: (number|null), raw: string}}
   *   Zero counts for an "All N VL files passed validation." message, the
   *   numbers from a "Validation: X errors, Y warnings" line when present,
   *   otherwise nulls. `raw` is always the stringified input.
   */
  function parseValidationSummary(validationText) {
    const raw = String(validationText || '');

    const allPassed = /All \d+ VL files passed validation\./.test(raw);
    if (allPassed) {
      return { errors: 0, warnings: 0, raw };
    }

    const counts = /Validation:\s+(\d+)\s+errors,\s+(\d+)\s+warnings/i.exec(raw);
    if (!counts) {
      return { errors: null, warnings: null, raw };
    }

    const [, errorCount, warningCount] = counts;
    return { errors: Number(errorCount), warnings: Number(warningCount), raw };
  }
  /**
   * Recursively read the text files of a project into a map.
   *
   * Entries whose name starts with '.' are skipped, except `.vl-code`. Only
   * files with a vx/sc/cp/vs/vdb/vth/json/txt/md extension (case-insensitive)
   * are read. Keys are '/'-joined paths relative to `projectDir`. Files that
   * cannot be read are left out rather than failing the whole scan.
   *
   * @param {string} projectDir - Directory to scan.
   * @returns {Promise<Object<string, string>>} Relative path -> UTF-8 content.
   */
  async function collectFileContents(projectDir) {
    const contents = {};
    const readablePattern = /\.(vx|sc|cp|vs|vdb|vth|json|txt|md)$/i;

    const visit = async (dir, prefix = '') => {
      const entries = await fs.readdir(dir, { withFileTypes: true });
      for (const entry of entries) {
        const { name } = entry;
        if (name.startsWith('.') && name !== '.vl-code') continue;

        const absolute = path.join(dir, name);
        const relative = prefix ? `${prefix}/${name}` : name;

        if (entry.isDirectory()) {
          await visit(absolute, relative);
          continue;
        }
        if (!readablePattern.test(name)) continue;

        try {
          contents[relative] = await fs.readFile(absolute, 'utf-8');
        } catch {
          // Best effort: an unreadable file is simply not part of the result.
        }
      }
    };

    await visit(projectDir);
    return contents;
  }
  /**
   * Count files and lines per file extension.
   *
   * Lines are counted by splitting on '\n', so empty content counts as one
   * line and a trailing newline adds one empty line. Files without an
   * extension are grouped under 'none'.
   *
   * @param {Object<string, *>} [fileMap] - Relative path -> content.
   * @returns {Object<string, {files: number, lines: number, nonEmptyLines: number}>}
   */
  function countLocByExt(fileMap) {
    return Object.entries(fileMap || {}).reduce((stats, [relPath, content]) => {
      const ext = path.extname(relPath) || 'none';
      const lines = String(content || '').split('\n');
      const nonEmpty = lines.filter((line) => line.trim());

      if (!stats[ext]) {
        stats[ext] = { files: 0, lines: 0, nonEmptyLines: 0 };
      }
      const bucket = stats[ext];
      bucket.files += 1;
      bucket.lines += lines.length;
      bucket.nonEmptyLines += nonEmpty.length;
      return stats;
    }, {});
  }
  /**
   * Collect simple structural facts about a theme file.
   *
   * `assignmentCount` is the number of non-blank lines that contain ':' and
   * do not start (after trimming) with '//', '#' or '<'.
   *
   * @param {*} themeContent - Theme source; falsy values are treated as ''.
   * @returns {{lines: number, assignmentCount: number, hasPointSlotValues: boolean,
   *   hasDesignTokens: boolean, hasBaseTheme: boolean}}
   */
  function summarizeTheme(themeContent) {
    const source = String(themeContent || '');
    const lines = source.split('\n');

    const skippedPrefixes = ['//', '#', '<'];
    const isAssignment = (line) => {
      const trimmed = line.trim();
      if (!trimmed) return false;
      if (skippedPrefixes.some((prefix) => trimmed.startsWith(prefix))) return false;
      return trimmed.includes(':');
    };

    let assignmentCount = 0;
    for (const line of lines) {
      if (isAssignment(line)) assignmentCount += 1;
    }

    return {
      lines: lines.length,
      assignmentCount,
      hasPointSlotValues: /# Point Slot Values/.test(source),
      hasDesignTokens: /# Design Tokens/.test(source),
      hasBaseTheme: /base_theme:"Platform\/Theme-Default-Light@1"/.test(source),
    };
  }
  /**
   * Zip the project's VL files and submit them to the parsevl endpoint.
   *
   * The archive is built by a `find | zip` shell pipeline (30 s timeout),
   * sent as a base64 data URL with action `parsePjt`, and deleted afterwards
   * regardless of the outcome. Diagnostics whose `level` is "warning"
   * (case-insensitive) count as warnings; everything else counts as an error.
   *
   * @param {string} projectDir - Project root; its base name is sent as the project name.
   * @param {*} cookie - Cookie value; normalized via normalizeCookie().
   * @returns {Promise<object>} On a completed request: httpOk, code, success,
   *   message, errCount, warningCount, previewUrls, errList (first 20
   *   diagnostics) and raw. If any step throws: success=false, error message,
   *   null counts and empty previewUrls/errList. Never rejects.
   */
  async function compileProject(projectDir, cookie) {
    const zipPath = path.join(projectDir, '__benchmark_compile.zip');
    const bodyCookie = normalizeCookie(cookie);
    const isWarning = (item) => String(item?.level || '').toLowerCase() === 'warning';

    try {
      execSync(
        `cd "${projectDir}" && find . -type f \\( -name "*.vx" -o -name "*.sc" -o -name "*.cp" -o -name "*.vs" -o -name "*.vdb" -o -name "*.vth" \\) | zip -q -@ "${zipPath}"`,
        { timeout: 30_000 },
      );

      const archive = await fs.readFile(zipPath);
      const payload = {
        action: 'parsePjt',
        file: `data:application/zip;base64,${archive.toString('base64')}`,
        download: true,
        projectName: path.basename(projectDir),
      };
      const res = await fetch(`${PARSEVL_URL}/edtfn/parsevl`, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Cookie': bodyCookie,
        },
        body: JSON.stringify(payload),
      });
      const json = await res.json();

      const diagnostics = Array.isArray(json?.data?.errList) ? json.data.errList : [];
      const hardErrors = diagnostics.filter((item) => !isWarning(item));
      const warnings = diagnostics.filter((item) => isWarning(item));
      const codeOk = json?.code === 0 || json?.code === 200;

      return {
        httpOk: res.ok,
        code: json?.code ?? null,
        success: res.ok && codeOk && hardErrors.length === 0,
        message: json?.message || null,
        errCount: hardErrors.length,
        warningCount: warnings.length,
        previewUrls: json?.data?.appPreviewUrlMap || {},
        errList: diagnostics.slice(0, 20),
        raw: json,
      };
    } catch (err) {
      // Any failure (zip, file read, network, JSON) is reported in the result.
      return {
        success: false,
        error: err.message,
        errCount: null,
        warningCount: null,
        previewUrls: {},
        errList: [],
      };
    } finally {
      try {
        await fs.unlink(zipPath);
      } catch {
        // The archive may not exist if zipping failed; nothing to clean up.
      }
    }
  }
  730. async function analyzeProject(projectDir, generationMeta) {
  731. const ctx = new VLProjectContext(projectDir);
  732. await ctx.load();
  733. const validateTool = createVLValidateTool(ctx);
  734. const validationText = await validateTool.execute({ file_path: 'all' });
  735. const validation = parseValidationSummary(validationText);
  736. const fileMap = await collectFileContents(projectDir);
  737. const vlFileMap = Object.fromEntries(
  738. Object.entries(fileMap).filter(([relPath]) => /\.(vx|sc|cp|vs|vdb|vth)$/i.test(relPath))
  739. );
  740. const metadata = extractFromFileTree(vlFileMap, projectDir);
  741. const metaValidation = validateMeta(metadata);
  742. const compile = await compileProject(projectDir, generationMeta.cookie);
  743. await fs.writeFile(path.join(projectDir, 'Process', 'BenchmarkValidation.txt'), validation.raw || '', 'utf-8');
  744. await fs.writeFile(path.join(projectDir, 'Process', 'BenchmarkCompile.json'), JSON.stringify(compile, null, 2), 'utf-8');
  745. if (metadata) {
  746. await fs.writeFile(path.join(projectDir, 'Process', 'BenchmarkExtractedMeta.json'), JSON.stringify(metadata, null, 2), 'utf-8');
  747. }
  748. const themeFile = Object.keys(vlFileMap).find((relPath) => relPath.endsWith('.vth'));
  749. return {
  750. projectDir,
  751. fileCount: Object.keys(vlFileMap).length,
  752. locByExt: countLocByExt(vlFileMap),
  753. validation,
  754. metadataValid: metaValidation.valid,
  755. metadataIssues: metaValidation.issues,
  756. compile,
  757. theme: summarizeTheme(themeFile ? vlFileMap[themeFile] : ''),
  758. extractedMeta: metadata
  759. ? {
  760. projectName: metadata.projectName || null,
  761. services: Array.isArray(metadata.services) ? metadata.services.length : 0,
  762. components: Array.isArray(metadata.components) ? metadata.components.length : 0,
  763. sections: Array.isArray(metadata.sections) ? metadata.sections.length : 0,
  764. apps: Array.isArray(metadata.apps) ? metadata.apps.length : 0,
  765. tables: Array.isArray(metadata.dataSchema?.tables) ? metadata.dataSchema.tables.length : 0,
  766. }
  767. : null,
  768. };
  769. }
  770. function summarizePathMetrics(label, runMeta, analysis) {
  771. return {
  772. label,
  773. durationMs: runMeta.durationMs,
  774. firstArtifactMs: runMeta.firstArtifactMs ?? null,
  775. fileCount: analysis.fileCount,
  776. validationErrors: analysis.validation.errors,
  777. validationWarnings: analysis.validation.warnings,
  778. metadataValid: analysis.metadataValid,
  779. metadataIssueCount: analysis.metadataIssues.length,
  780. compileSuccess: analysis.compile.success,
  781. compileErrors: analysis.compile.errCount,
  782. compileWarnings: analysis.compile.warningCount,
  783. previewCount: Object.keys(analysis.compile.previewUrls || {}).length,
  784. themeAssignments: analysis.theme.assignmentCount,
  785. themeLines: analysis.theme.lines,
  786. };
  787. }
  788. function buildMarkdownReport(context) {
  789. const { docs, directRun, workflowRun, directAnalysis, workflowAnalysis } = context;
  790. const directSummary = summarizePathMetrics('Direct', directRun, directAnalysis);
  791. const workflowSummary = summarizePathMetrics('Workflow', workflowRun, workflowAnalysis);
  792. return [
  793. `# LLM Codegen Benchmark (${TODAY})`,
  794. '',
  795. '## Scenario',
  796. '',
  797. `- App: CampusOps`,
  798. `- Model: ${MODEL}`,
  799. `- Target language: ${TARGET_LANG}`,
  800. `- Latest DocCenter VL syntax version observed: ${docs.latestVlVersion || 'unknown'}`,
  801. '- Latest Theme reference: Theme-Enterprise-6.5',
  802. '- Workflow baseline: VLCode-Lite 6-file codegen workflow',
  803. '',
  804. '## Requirement',
  805. '',
  806. APP_REQUIREMENT,
  807. '',
  808. '## Summary',
  809. '',
  810. `- Direct baseline project: ${directAnalysis.projectDir}`,
  811. `- Workflow baseline project: ${workflowAnalysis.projectDir}`,
  812. `- Direct duration: ${directSummary.durationMs} ms`,
  813. `- Workflow duration: ${workflowSummary.durationMs} ms`,
  814. `- Direct compile success: ${directSummary.compileSuccess}`,
  815. `- Workflow compile success: ${workflowSummary.compileSuccess}`,
  816. '',
  817. '## Direct',
  818. '',
  819. '```json',
  820. JSON.stringify(directSummary, null, 2),
  821. '```',
  822. '',
  823. '## Workflow',
  824. '',
  825. '```json',
  826. JSON.stringify(workflowSummary, null, 2),
  827. '```',
  828. '',
  829. '## Notes',
  830. '',
  831. `- Direct missing paths: ${directRun.missingPaths.join(', ') || 'none'}`,
  832. `- Direct extra paths: ${directRun.extraPaths.join(', ') || 'none'}`,
  833. `- Direct metadata issues: ${directAnalysis.metadataIssues.join(' | ') || 'none'}`,
  834. `- Workflow metadata issues: ${workflowAnalysis.metadataIssues.join(' | ') || 'none'}`,
  835. ].join('\n');
  836. }
  837. async function main() {
  838. await fs.mkdir(TEST_ROOT, { recursive: true });
  839. await fs.mkdir(REPORT_DIR, { recursive: true });
  840. const cookie = getCookie({ workDir: process.cwd(), cookie: '' });
  841. if (!cookie) {
  842. throw new Error('No DocCenter/cloud cookie found. Cannot run authenticated benchmark.');
  843. }
  844. console.log(`[benchmark] fetching latest docs from DocCenter`);
  845. const latestVlDoc = await fetchDoc(1, cookie);
  846. const latestThemeDoc = await fetchDoc(4, cookie);
  847. const latestVlVersion = latestVlDoc.match(/Current version:\s*`\/\/\s*VL_VERSION:([^`]+)`/i)?.[1]?.trim() || null;
  848. const directProjectName = projectNameWithFallback(`CampusOpsDirect${slugDate(TODAY)}Test`);
  849. const workflowProjectName = projectNameWithFallback(`CampusOpsWorkflow${slugDate(TODAY)}Test`);
  850. const directProjectDir = path.join(TEST_ROOT, directProjectName);
  851. const workflowProjectDir = path.join(TEST_ROOT, workflowProjectName);
  852. console.log(`[benchmark] direct baseline -> ${directProjectDir}`);
  853. const directRun = await runDirectBaseline({
  854. projectDir: directProjectDir,
  855. latestThemeDoc,
  856. latestVlVersion,
  857. });
  858. console.log(`[benchmark] workflow baseline -> ${workflowProjectDir}`);
  859. const workflowRun = await runWorkflowBaseline({
  860. projectDir: workflowProjectDir,
  861. cookie,
  862. });
  863. console.log('[benchmark] analyzing direct baseline');
  864. const directAnalysis = await analyzeProject(directProjectDir, { cookie });
  865. console.log('[benchmark] analyzing workflow baseline');
  866. const workflowAnalysis = await analyzeProject(workflowProjectDir, { cookie });
  867. const report = {
  868. createdAt: new Date().toISOString(),
  869. model: MODEL,
  870. latestDocs: {
  871. latestVlVersion,
  872. themeName: 'Theme-Enterprise-6.5',
  873. themeLength: latestThemeDoc.length,
  874. },
  875. requirement: APP_REQUIREMENT,
  876. direct: {
  877. run: directRun,
  878. analysis: directAnalysis,
  879. summary: summarizePathMetrics('Direct', directRun, directAnalysis),
  880. },
  881. workflow: {
  882. run: workflowRun,
  883. analysis: workflowAnalysis,
  884. summary: summarizePathMetrics('Workflow', workflowRun, workflowAnalysis),
  885. },
  886. };
  887. const reportJsonPath = path.join(REPORT_DIR, `llm-codegen-benchmark-${TODAY}.json`);
  888. const reportMdPath = path.join(REPORT_DIR, `llm-codegen-benchmark-${TODAY}.md`);
  889. await fs.writeFile(reportJsonPath, JSON.stringify(report, null, 2), 'utf-8');
  890. await fs.writeFile(reportMdPath, buildMarkdownReport({
  891. docs: report.latestDocs,
  892. directRun,
  893. workflowRun,
  894. directAnalysis,
  895. workflowAnalysis,
  896. }), 'utf-8');
  897. console.log('[benchmark] complete');
  898. console.log(JSON.stringify({
  899. reportJsonPath,
  900. reportMdPath,
  901. directProjectDir,
  902. workflowProjectDir,
  903. directSummary: report.direct.summary,
  904. workflowSummary: report.workflow.summary,
  905. }, null, 2));
  906. }
  907. main().catch((err) => {
  908. console.error('[benchmark] failed:', err.message);
  909. process.exitCode = 1;
  910. });