fix infinite loop in profile picker and switch predictor-based routing to on by default, with a header to disable (#1929)
* fix infinite loop in profile picker when using latency routing with predictor-based scheduling off
* add fix in ProcessResults
* Fix type for lint
* Fix prefix cache not being ordered properly in profile picker, and set predictor scheduling to true instead of false when no flag is present
* Change predictor-based scheduling header to one that disables it, and make it on by default when deploying with latency-based routing
* Move slo profile handler into slo routing package
* Add slo aware handler
logger.V(logutil.DEBUG).Error(errutil.Error{Code: errutil.BadRequest, Msg: fmt.Sprintf("%v must be a float: %v", tpotSLOHeaderKey, err)}, "SLOAwareRouter: Error parsing TPOT SLO from header")
logger.V(logutil.DEBUG).Error(errutil.Error{Code: errutil.BadRequest, Msg: fmt.Sprintf("x-prediction-based-scheduling must be a bool: %v", err)}, "SLOAwareRouter: Error parsing PredictorBasedScheduling from header")
logger.V(logutil.DEBUG).Error(err, "error parsing predictorBasedScheduling from header; failed to choose scheduling profile: x-prediction-based-scheduling must be a bool")
return nil, fmt.Errorf("error parsing predictorBasedScheduling from header; failed to choose scheduling profile: x-prediction-based-scheduling must be a bool: %v", err)
site-src/guides/latency-based-predictor.md (+3 −3)
@@ -8,7 +8,7 @@ Latency-based routing is a feature of the Inference Gateway that enables intelli
The latency-based routing feature is implemented as a plugin for the Endpoint Picker (EPP). When a request is received, the plugin performs the following steps:
-1. **SLO Extraction**: The plugin extracts the TTFT and TPOT SLOs from the request headers (`x-slo-ttft-ms` and `x-slo-tpot-ms`). It also checks for the `x-prediction-based-scheduling` header to determine if latency-based routing should be used for this request.
+1. **SLO Extraction**: The plugin extracts the TTFT and TPOT SLOs from the request headers (`x-slo-ttft-ms` and `x-slo-tpot-ms`). It also checks for the `x-prediction-based-scheduling-off` header to determine if latency-based routing should be used for this request.
2. **Latency Prediction**: The plugin uses a latency predictor, deployed as a set of sidecar containers alongside the EPP, to predict the TTFT and TPOT for the request on each of the available model servers. The prediction is based on the current state of the server, including its KV cache utilization and the number of running and waiting requests.
@@ -22,7 +22,7 @@ The latency-based routing feature is implemented as a plugin for the Endpoint Pi
To use latency-based routing, you need to include the following headers in your inference requests:
-- `x-prediction-based-scheduling`: Set to `true` to enable latency-based routing for the request; setting this to `false` or omitting the header will use non-SLO routing, but will still use the latency data to train the predictor.
+- `x-prediction-based-scheduling-off`: Include this header to disable predictive routing for that specific request. If omitted, predictive routing is enabled by default.
- `x-slo-ttft-ms`: The Time to First Token SLO in milliseconds.
- `x-slo-tpot-ms`: The Time Per Output Token SLO in milliseconds (this is vLLM's equivalent of ITL; it is **not** NTPOT).
@@ -78,7 +78,7 @@ If you have a standard setup using the [Getting Started Guide](getting-start
```txt
export GW_IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'):80
```