@@ -12,27 +12,6 @@ import {
 } from '../shared-interfaces.js';
 import { BuildResultStatus } from '../workers/builder/builder-types.js';
 import { BUCKET_CONFIG } from '../ratings/stats.js';
-import { POINTS_FOR_CATEGORIES } from '../ratings/rating-types.js';
-
-export const reportLlmEvalsToolContext = `## What is a report?
-A report consists of many apps that were LLM generated. You will have information
-about checks that failed for this LLM generated app.
-
-Note that there may be multiple attempts for an app. E.g. an initial build may fail and
-another attempt might have repaired the build failure. The last attempt reflects the final
-state of the app. E.g. whether it does build, or if there are runtime errors.
-
-## Scoring mechanism
-Apps are rated based on their scores in the following buckets:
-${BUCKET_CONFIG.map(b => `* ${b.name}: ${b.min}-${b.max}`).join('\n')}
-
-The overall score of an app is determined based on score reductions.
-There are three pillars: ${Object.keys(POINTS_FOR_CATEGORIES).join(', ')}
-Pillars are a split up of a 100% perfect score, allowing for individual ratings
-to be less impactful than others. The pillars are distributed as follows:
-${Object.entries(POINTS_FOR_CATEGORIES).map(e => `* ${e[0]}: ${e[1]} points.`)}
-Within pillars, the available score can be reduced by individual ratings.
-`;
 
 const defaultAiChatPrompt = `Strictly follow the instructions here.
 - You are an expert in LLM-based code generation evaluation and quality assessments.
@@ -90,7 +69,7 @@ export async function chatWithReportAI(
 ${message}
 \`\`\`
 
-${reportLlmEvalsToolContext}
+${getContextPrompt(assessmentsToProcess)}
 
 ### How many apps are there?
 There are ${allAssessments.length} apps in this report.
@@ -193,3 +172,36 @@ function isAssessmentResultWithID(
 ): value is AssessmentResultFromReportServer {
   return (value as Partial<AssessmentResultFromReportServer>).id !== undefined;
 }
+
+function getContextPrompt(assessments: AssessmentResultFromReportServer[] | AssessmentResult[]) {
+  let categoryCount = 0;
+  let pointsForCategories = {} as Record<string, number>;
+
+  // Deduce the categories from the first result since they're the same for the entire run.
+  if (assessments.length) {
+    assessments[0].score.categories.forEach(category => {
+      categoryCount++;
+      pointsForCategories[category.id] = category.maxPoints;
+    });
+  }
+
+  return `## What is a report?
+A report consists of many apps that were LLM generated. You will have information
+about checks that failed for this LLM generated app.
+
+Note that there may be multiple attempts for an app. E.g. an initial build may fail and
+another attempt might have repaired the build failure. The last attempt reflects the final
+state of the app. E.g. whether it does build, or if there are runtime errors.
+
+## Scoring mechanism
+Apps are rated based on their scores in the following buckets:
+${BUCKET_CONFIG.map(b => `* ${b.name}: ${b.min}-${b.max}`).join('\n')}
+
+The overall score of an app is determined based on score reductions.
+There are ${categoryCount} pillars: ${Object.keys(pointsForCategories).join(', ')}
+Pillars are a split up of a 100% perfect score, allowing for individual ratings
+to be less impactful than others. The pillars are distributed as follows:
+${Object.entries(pointsForCategories).map(e => `* ${e[0]}: ${e[1]} points.`)}
+Within pillars, the available score can be reduced by individual ratings.
+`;
+}
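
For reference, here is a rough sketch of how the new `getContextPrompt` helper behaves, assuming mock assessment objects that expose `score.categories` entries with `id` and `maxPoints` fields (as the diff implies). The category ids and point values below are invented for illustration, and since the helper is not exported, this snippet would only run inside the same module or a test that exports it.

```ts
// Hypothetical mock data for illustration only; real category ids and
// point values come from the report run, not from this sketch.
const mockAssessments = [
  {
    score: {
      categories: [
        {id: 'build', maxPoints: 40},
        {id: 'runtime', maxPoints: 30},
        {id: 'code-quality', maxPoints: 30},
      ],
    },
  },
];

// The helper derives the pillar count and point distribution from the
// first result, e.g. "There are 3 pillars: build, runtime, code-quality".
const prompt = getContextPrompt(mockAssessments as unknown as AssessmentResult[]);
console.log(prompt);
```

Deriving the pillars from the first assessment result (rather than importing the removed `POINTS_FOR_CATEGORIES` constant) keeps the prompt in sync with whatever category configuration produced the report being discussed.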