Skip to content

Commit df47768

Browse files
committed
feat: add config option to override categories
Adds a config option that allows users to override the config for a category.
1 parent 906ee4d commit df47768

File tree

6 files changed

+81
-43
lines changed

6 files changed

+81
-43
lines changed

runner/configuration/environment-config.ts

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import z from 'zod';
22
import {createMessageBuilder, fromError} from 'zod-validation-error/v3';
33
import {UserFacingError} from '../utils/errors.js';
4-
import {ratingOverrideSchema, ratingSchema} from '../ratings/rating-types.js';
4+
import {RatingCategory, ratingOverrideSchema, ratingSchema} from '../ratings/rating-types.js';
55
import {EvalPrompt, EvalPromptWithMetadata, MultiStepPrompt} from './prompts.js';
66
import {executorSchema} from '../orchestration/executors/executor.js';
77
import {
@@ -77,6 +77,20 @@ export const environmentConfigSchema = z.object({
7777
'Executor to be used for this environment. ' +
7878
'If unset, a local executor is derived from the full environment configuration.',
7979
),
80+
81+
/**
82+
* Map used to override fields for specific rating categories. The key is the unique ID of
83+
the category and the value is an object containing the fields to override.
84+
*/
85+
categoryOverrides: z
86+
.record(
87+
z.custom<RatingCategory>(),
88+
z.object({
89+
name: z.string().optional(),
90+
maxPoints: z.number().optional(),
91+
}),
92+
)
93+
.optional(),
8094
});
8195

8296
/**

runner/configuration/environment.ts

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import {readdirSync, readFileSync, statSync} from 'fs';
22
import {basename, extname, join, resolve} from 'path';
33
import {globSync} from 'tinyglobby';
44
import {Executor} from '../orchestration/executors/executor.js';
5-
import {Rating} from '../ratings/rating-types.js';
5+
import {Rating, RatingCategory} from '../ratings/rating-types.js';
66
import {
77
FrameworkInfo,
88
MultiStepPromptDefinition,
@@ -38,6 +38,12 @@ export class Environment {
3838
readonly executor: Executor;
3939
/** Timeout for a single eval prompt in minutes. */
4040
readonly promptTimeoutMinutes: number | undefined;
41+
/** Configuration for the individual rating categories. */
42+
readonly ratingCategories: {
43+
[RatingCategory.HIGH_IMPACT]: {name: string; maxPoints: number};
44+
[RatingCategory.MEDIUM_IMPACT]: {name: string; maxPoints: number};
45+
[RatingCategory.LOW_IMPACT]: {name: string; maxPoints: number};
46+
};
4147

4248
constructor(
4349
rootPath: string,
@@ -65,6 +71,7 @@ export class Environment {
6571
this.isBuiltIn = rootPath.includes('node_modules');
6672
this.executor = config.executor;
6773
this.promptTimeoutMinutes = config.promptTimeoutMinutes;
74+
this.ratingCategories = this.getRatingCategories(config);
6875
}
6976

7077
/** Prompts that should be executed as a part of the evaluation. */
@@ -370,4 +377,26 @@ export class Environment {
370377

371378
return result;
372379
}
380+
381+
/**
 * Resolves the display name and maximum points of every rating category by layering the
 * user-provided `categoryOverrides` from the environment config on top of the built-in
 * defaults.
 *
 * @param config Environment configuration that may contain `categoryOverrides`.
 * @returns A fully-populated `{name, maxPoints}` entry for each `RatingCategory`.
 */
private getRatingCategories(config: EnvironmentConfig): Environment['ratingCategories'] {
  const overrides = config.categoryOverrides;

  // Merge field-by-field with `??` rather than spreading the override object: a spread
  // would let an explicitly `undefined` optional field (e.g. `{name: undefined}`) replace
  // the default, producing a value that violates the declared `string`/`number` types.
  const withOverride = (category: RatingCategory, name: string, maxPoints: number) => {
    const override = overrides?.[category];
    return {
      name: override?.name ?? name,
      maxPoints: override?.maxPoints ?? maxPoints,
    };
  };

  return {
    [RatingCategory.HIGH_IMPACT]: withOverride(RatingCategory.HIGH_IMPACT, 'High Impact', 60),
    [RatingCategory.MEDIUM_IMPACT]: withOverride(
      RatingCategory.MEDIUM_IMPACT,
      'Medium Impact',
      30,
    ),
    [RatingCategory.LOW_IMPACT]: withOverride(RatingCategory.LOW_IMPACT, 'Low Impact', 10),
  };
}
373402
}

runner/ratings/rate-code.ts

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,7 @@ import {
1919
PerFileRatingContentType,
2020
RatingKind,
2121
RatingCategory,
22-
POINTS_FOR_CATEGORIES,
2322
Rating,
24-
CATEGORY_NAMES,
2523
RatingsResult,
2624
} from './rating-types.js';
2725
import {extractEmbeddedCodeFromTypeScript} from './embedded-languages.js';
@@ -82,10 +80,9 @@ export async function rateGeneratedCode(
8280
RatingCategory.MEDIUM_IMPACT,
8381
RatingCategory.LOW_IMPACT,
8482
].map(category => ({
83+
...environment.ratingCategories[category],
8584
id: category,
86-
name: CATEGORY_NAMES[category],
8785
points: 0,
88-
maxPoints: POINTS_FOR_CATEGORIES[category],
8986
assessments: [],
9087
}));
9188

runner/ratings/rating-types.ts

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -32,20 +32,6 @@ export enum RatingCategory {
3232
LOW_IMPACT = 'low-impact',
3333
}
3434

35-
/** Points correspond to each `RatingCategory`. */
36-
export const POINTS_FOR_CATEGORIES = {
37-
[RatingCategory.HIGH_IMPACT]: 60,
38-
[RatingCategory.MEDIUM_IMPACT]: 30,
39-
[RatingCategory.LOW_IMPACT]: 10,
40-
};
41-
42-
/** Display names for each `RatingCategory`. */
43-
export const CATEGORY_NAMES = {
44-
[RatingCategory.HIGH_IMPACT]: 'High Impact',
45-
[RatingCategory.MEDIUM_IMPACT]: 'Medium Impact',
46-
[RatingCategory.LOW_IMPACT]: 'Low Impact',
47-
};
48-
4935
const ratingCommonContextFields = {
5036
ratingsResult: z.record(z.custom<IndividualAssessment | SkippedIndividualAssessment>()),
5137
prompt: z.custom<PromptDefinition>(),

runner/reporting/report-ai-chat.ts

Lines changed: 34 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -12,27 +12,6 @@ import {
1212
} from '../shared-interfaces.js';
1313
import {BuildResultStatus} from '../workers/builder/builder-types.js';
1414
import {BUCKET_CONFIG} from '../ratings/stats.js';
15-
import {POINTS_FOR_CATEGORIES} from '../ratings/rating-types.js';
16-
17-
export const reportLlmEvalsToolContext = `## What is a report?
18-
A report consists of many apps that were LLM generated. You will have information
19-
about checks that failed for this LLM generated app.
20-
21-
Note that there may be multiple attempts for an app. E.g. an initial build may fail and
22-
another attempt might have repaired the build failure. The last attempt reflects the final
23-
state of the app. E.g. whether it does build, or if there are runtime errors.
24-
25-
## Scoring mechanism
26-
Apps are rated based on their scores in the following buckets:
27-
${BUCKET_CONFIG.map(b => `* ${b.name}: ${b.min}-${b.max}`).join('\n')}
28-
29-
The overall score of an app is determined based on score reductions.
30-
There are three pillars: ${Object.keys(POINTS_FOR_CATEGORIES).join(', ')}
31-
Pillars are a split up of a 100% perfect score, allowing for individual ratings
32-
to be less impactful than others. The pillars are distributed as follows:
33-
${Object.entries(POINTS_FOR_CATEGORIES).map(e => `* ${e[0]}: ${e[1]} points.`)}
34-
Within pillars, the available score can be reduced by individual ratings.
35-
`;
3615

3716
const defaultAiChatPrompt = `Strictly follow the instructions here.
3817
- You are an expert in LLM-based code generation evaluation and quality assessments.
@@ -90,7 +69,7 @@ export async function chatWithReportAI(
9069
${message}
9170
\`\`\`
9271
93-
${reportLlmEvalsToolContext}
72+
${getContextPrompt(assessmentsToProcess)}
9473
9574
### How many apps are there?
9675
There are ${allAssessments.length} apps in this report.
@@ -193,3 +172,36 @@ function isAssessmentResultWithID(
193172
): value is AssessmentResultFromReportServer {
194173
return (value as Partial<AssessmentResultFromReportServer>).id !== undefined;
195174
}
175+
176+
/**
 * Builds the shared context section of the report chat prompt: what a report is, the score
 * buckets, and how the perfect score is split across the rating categories ("pillars").
 *
 * @param assessments Assessment results of the report. Only the first entry is inspected.
 * @returns The context text. When `assessments` is empty, the pillar count is reported as 0
 *   and the pillar lists are empty.
 */
function getContextPrompt(
  assessments: AssessmentResultFromReportServer[] | AssessmentResult[],
): string {
  // Deduce the categories from the first result since they're the same for the entire run.
  const categories = assessments[0]?.score.categories ?? [];
  const pillarIds = categories.map(category => category.id).join(', ');
  // Join explicitly: interpolating the array itself would stringify it with commas and
  // put every bullet on a single line.
  const pillarPoints = categories
    .map(category => `* ${category.id}: ${category.maxPoints} points.`)
    .join('\n');

  return `## What is a report?
A report consists of many apps that were LLM generated. You will have information
about checks that failed for this LLM generated app.

Note that there may be multiple attempts for an app. E.g. an initial build may fail and
another attempt might have repaired the build failure. The last attempt reflects the final
state of the app. E.g. whether it does build, or if there are runtime errors.

## Scoring mechanism
Apps are rated based on their scores in the following buckets:
${BUCKET_CONFIG.map(b => `* ${b.name}: ${b.min}-${b.max}`).join('\n')}

The overall score of an app is determined based on score reductions.
There are ${categories.length} pillars: ${pillarIds}
Pillars are a split up of a 100% perfect score, allowing for individual ratings
to be less impactful than others. The pillars are distributed as follows:
${pillarPoints}
Within pillars, the available score can be reduced by individual ratings.
`;
}

runner/shared-interfaces.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ export interface LlmContextFile {
174174
export interface AssessmentCategory {
175175
/** Unique ID of the category. */
176176
id: RatingCategory;
177-
/** Display name of the cateogry. */
177+
/** Display name of the category. */
178178
name: string;
179179
/** Points that have been awarded to the category. */
180180
points: number;

0 commit comments

Comments
 (0)