feat: add config option to override categories

crisbeto · crisbeto · commit df477682742b · 2025-12-04T11:16:37.000+01:00
Adds a `categoryOverrides` environment config option that allows users to override the display name and maximum points of individual rating categories.
diff --git a/runner/configuration/environment-config.ts b/runner/configuration/environment-config.ts
@@ -1,7 +1,7 @@
 import z from 'zod';
 import {createMessageBuilder, fromError} from 'zod-validation-error/v3';
 import {UserFacingError} from '../utils/errors.js';
-import {ratingOverrideSchema, ratingSchema} from '../ratings/rating-types.js';
+import {RatingCategory, ratingOverrideSchema, ratingSchema} from '../ratings/rating-types.js';
 import {EvalPrompt, EvalPromptWithMetadata, MultiStepPrompt} from './prompts.js';
 import {executorSchema} from '../orchestration/executors/executor.js';
 import {
@@ -77,6 +77,20 @@ export const environmentConfigSchema = z.object({
       'Executor to be used for this environment. ' +
         'If unset, a local executor is derived from the full environment configuration.',
     ),
+
+  /**
+   * Map used to override fields for specific rating categories. The key is the unique ID of
+   * the category and the value contains the override fields.
+   */
+  categoryOverrides: z
+    .record(
+      z.custom<RatingCategory>(),
+      z.object({
+        name: z.string().optional(),
+        maxPoints: z.number().optional(),
+      }),
+    )
+    .optional(),
 });
 
 /**
diff --git a/runner/configuration/environment.ts b/runner/configuration/environment.ts
@@ -2,7 +2,7 @@ import {readdirSync, readFileSync, statSync} from 'fs';
 import {basename, extname, join, resolve} from 'path';
 import {globSync} from 'tinyglobby';
 import {Executor} from '../orchestration/executors/executor.js';
-import {Rating} from '../ratings/rating-types.js';
+import {Rating, RatingCategory} from '../ratings/rating-types.js';
 import {
   FrameworkInfo,
   MultiStepPromptDefinition,
@@ -38,6 +38,12 @@ export class Environment {
   readonly executor: Executor;
   /** Timeout for a single eval prompt in minutes. */
   readonly promptTimeoutMinutes: number | undefined;
+  /** Configuration for the individual rating categories. */
+  readonly ratingCategories: {
+    [RatingCategory.HIGH_IMPACT]: {name: string; maxPoints: number};
+    [RatingCategory.MEDIUM_IMPACT]: {name: string; maxPoints: number};
+    [RatingCategory.LOW_IMPACT]: {name: string; maxPoints: number};
+  };
 
   constructor(
     rootPath: string,
@@ -65,6 +71,7 @@ export class Environment {
     this.isBuiltIn = rootPath.includes('node_modules');
     this.executor = config.executor;
     this.promptTimeoutMinutes = config.promptTimeoutMinutes;
+    this.ratingCategories = this.getRatingCategories(config);
   }
 
   /** Prompts that should be executed as a part of the evaluation. */
@@ -370,4 +377,35 @@ export class Environment {
 
     return result;
   }
+
+  /**
+   * Resolves the display name and maximum points of each rating category by applying
+   * `config.categoryOverrides` on top of the built-in defaults.
+   *
+   * Each field is resolved individually: an override field that is missing or explicitly
+   * `undefined` falls back to its default instead of replacing it.
+   *
+   * @param config Environment configuration that may contain `categoryOverrides`.
+   * @returns Name and maximum points for the high, medium and low impact categories.
+   */
+  private getRatingCategories(config: EnvironmentConfig): Environment['ratingCategories'] {
+    const overrides = config.categoryOverrides;
+
+    // Use `??` per field rather than spreading the override object, because a spread copies
+    // an explicitly `undefined` property over the default value.
+    return {
+      [RatingCategory.HIGH_IMPACT]: {
+        name: overrides?.[RatingCategory.HIGH_IMPACT]?.name ?? 'High Impact',
+        maxPoints: overrides?.[RatingCategory.HIGH_IMPACT]?.maxPoints ?? 60,
+      },
+      [RatingCategory.MEDIUM_IMPACT]: {
+        name: overrides?.[RatingCategory.MEDIUM_IMPACT]?.name ?? 'Medium Impact',
+        maxPoints: overrides?.[RatingCategory.MEDIUM_IMPACT]?.maxPoints ?? 30,
+      },
+      [RatingCategory.LOW_IMPACT]: {
+        name: overrides?.[RatingCategory.LOW_IMPACT]?.name ?? 'Low Impact',
+        maxPoints: overrides?.[RatingCategory.LOW_IMPACT]?.maxPoints ?? 10,
+      },
+    };
+  }
 }
diff --git a/runner/ratings/rate-code.ts b/runner/ratings/rate-code.ts
@@ -19,9 +19,7 @@ import {
   PerFileRatingContentType,
   RatingKind,
   RatingCategory,
-  POINTS_FOR_CATEGORIES,
   Rating,
-  CATEGORY_NAMES,
   RatingsResult,
 } from './rating-types.js';
 import {extractEmbeddedCodeFromTypeScript} from './embedded-languages.js';
@@ -82,10 +80,9 @@ export async function rateGeneratedCode(
     RatingCategory.MEDIUM_IMPACT,
     RatingCategory.LOW_IMPACT,
   ].map(category => ({
+    ...environment.ratingCategories[category],
     id: category,
-    name: CATEGORY_NAMES[category],
     points: 0,
-    maxPoints: POINTS_FOR_CATEGORIES[category],
     assessments: [],
   }));
 
diff --git a/runner/ratings/rating-types.ts b/runner/ratings/rating-types.ts
@@ -32,20 +32,6 @@ export enum RatingCategory {
   LOW_IMPACT = 'low-impact',
 }
 
-/** Points correspond to each `RatingCategory`. */
-export const POINTS_FOR_CATEGORIES = {
-  [RatingCategory.HIGH_IMPACT]: 60,
-  [RatingCategory.MEDIUM_IMPACT]: 30,
-  [RatingCategory.LOW_IMPACT]: 10,
-};
-
-/** Display names for each `RatingCategory`. */
-export const CATEGORY_NAMES = {
-  [RatingCategory.HIGH_IMPACT]: 'High Impact',
-  [RatingCategory.MEDIUM_IMPACT]: 'Medium Impact',
-  [RatingCategory.LOW_IMPACT]: 'Low Impact',
-};
-
 const ratingCommonContextFields = {
   ratingsResult: z.record(z.custom<IndividualAssessment | SkippedIndividualAssessment>()),
   prompt: z.custom<PromptDefinition>(),
diff --git a/runner/reporting/report-ai-chat.ts b/runner/reporting/report-ai-chat.ts
@@ -12,27 +12,6 @@ import {
 } from '../shared-interfaces.js';
 import {BuildResultStatus} from '../workers/builder/builder-types.js';
 import {BUCKET_CONFIG} from '../ratings/stats.js';
-import {POINTS_FOR_CATEGORIES} from '../ratings/rating-types.js';
-
-export const reportLlmEvalsToolContext = `## What is a report?
-A report consists of many apps that were LLM generated. You will have information
-about checks that failed for this LLM generated app.
-
-Note that there may be multiple attempts for an app. E.g. an initial build may fail and
-another attempt might have repaired the build failure. The last attempt reflects the final
-state of the app. E.g. whether it does build, or if there are runtime errors.
-
-## Scoring mechanism
-Apps are rated based on their scores in the following buckets:
-${BUCKET_CONFIG.map(b => `* ${b.name}: ${b.min}-${b.max}`).join('\n')}
-
-The overall score of an app is determined based on score reductions.
-There are three pillars: ${Object.keys(POINTS_FOR_CATEGORIES).join(', ')}
-Pillars are a split up of a 100% perfect score, allowing for individual ratings
-to be less impactful than others. The pillars are distributed as follows:
-${Object.entries(POINTS_FOR_CATEGORIES).map(e => `* ${e[0]}: ${e[1]} points.`)}
-Within pillars, the available score can be reduced by individual ratings.
-`;
 
 const defaultAiChatPrompt = `Strictly follow the instructions here.
 - You are an expert in LLM-based code generation evaluation and quality assessments.
@@ -90,7 +69,7 @@ export async function chatWithReportAI(
 ${message}
 \`\`\`
 
-${reportLlmEvalsToolContext}
+${getContextPrompt(assessmentsToProcess)}
 
 ### How many apps are there?
 There are ${allAssessments.length} apps in this report.
@@ -193,3 +172,45 @@ function isAssessmentResultWithID(
 ): value is AssessmentResultFromReportServer {
   return (value as Partial<AssessmentResultFromReportServer>).id !== undefined;
 }
+
+/**
+ * Builds the background section of the report AI chat prompt, describing what a report is
+ * and how apps are scored.
+ *
+ * The scoring pillars are taken from the `score.categories` of the first assessment instead
+ * of a hard-coded table, on the premise that they are the same for the entire run.
+ *
+ * @param assessments Assessments of the report. May be empty, in which case zero pillars
+ *   are listed.
+ * @returns Markdown text to be interpolated into the chat prompt.
+ */
+function getContextPrompt(assessments: AssessmentResultFromReportServer[] | AssessmentResult[]) {
+  // Deduce the categories from the first result since they're the same for the entire run.
+  const categories = assessments[0]?.score.categories ?? [];
+  const pillarIds = categories.map(category => category.id).join(', ');
+  // Join explicitly: an array interpolated into a template literal is comma-separated,
+  // which would put all of the bullets on a single line.
+  const pillarPoints = categories
+    .map(category => `* ${category.id}: ${category.maxPoints} points.`)
+    .join('\n');
+
+  return `## What is a report?
+A report consists of many apps that were LLM generated. You will have information
+about checks that failed for this LLM generated app.
+
+Note that there may be multiple attempts for an app. E.g. an initial build may fail and
+another attempt might have repaired the build failure. The last attempt reflects the final
+state of the app. E.g. whether it does build, or if there are runtime errors.
+
+## Scoring mechanism
+Apps are rated based on their scores in the following buckets:
+${BUCKET_CONFIG.map(b => `* ${b.name}: ${b.min}-${b.max}`).join('\n')}
+
+The overall score of an app is determined based on score reductions.
+There are ${categories.length} pillars: ${pillarIds}
+Pillars are a split up of a 100% perfect score, allowing for individual ratings
+to be less impactful than others. The pillars are distributed as follows:
+${pillarPoints}
+Within pillars, the available score can be reduced by individual ratings.
+`;
+}
diff --git a/runner/shared-interfaces.ts b/runner/shared-interfaces.ts
@@ -174,7 +174,7 @@ export interface LlmContextFile {
 export interface AssessmentCategory {
   /** Unique ID of the category. */
   id: RatingCategory;
-  /** Display name of the cateogry. */
+  /** Display name of the category. */
   name: string;
   /** Points that have been awarded to the category. */
   points: number;